118 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			118 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| /**
 | |
|  * Iterates individual words of DOM text and CDATA nodes
 | |
|  * while keeping track of their position in the document.
 | |
|  *
 | |
|  * Example:
 | |
|  *
 | |
|  *  $doc = new DOMDocument();
 | |
|  *  $doc->load('example.xml');
 | |
|  *  foreach(new DOMWordsIterator($doc) as $word) echo $word;
 | |
|  *
 | |
|  * @author pjgalbraith http://www.pjgalbraith.com
 | |
|  * @author porneL http://pornel.net (based on DOMLettersIterator available at http://pornel.net/source/domlettersiterator.php)
 | |
|  * @license Public Domain
 | |
|  *
 | |
|  */
 | |
| 
 | |
| final class DOMWordsIterator implements Iterator {
 | |
|     
 | |
|     private $start, $current;
 | |
|     private $offset, $key, $words;
 | |
| 
 | |
|     /**
 | |
|      * expects DOMElement or DOMDocument (see DOMDocument::load and DOMDocument::loadHTML)
 | |
|      */
 | |
|     function __construct(DOMNode $el)
 | |
|     {
 | |
|         if ($el instanceof DOMDocument) $this->start = $el->documentElement;
 | |
|         else if ($el instanceof DOMElement) $this->start = $el;
 | |
|         else throw new InvalidArgumentException("Invalid arguments, expected DOMElement or DOMDocument");
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Returns position in text as DOMText node and character offset.
 | |
|      * (it's NOT a byte offset, you must use mb_substr() or similar to use this offset properly).
 | |
|      * node may be NULL if iterator has finished.
 | |
|      *
 | |
|      * @return array
 | |
|      */
 | |
|     function currentWordPosition()
 | |
|     {
 | |
|         return array($this->current, $this->offset, $this->words);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Returns DOMElement that is currently being iterated or NULL if iterator has finished.
 | |
|      *
 | |
|      * @return DOMElement
 | |
|      */
 | |
|     function currentElement()
 | |
|     {
 | |
|         return $this->current ? $this->current->parentNode : NULL;
 | |
|     }
 | |
|     
 | |
|     // Implementation of Iterator interface
 | |
|     function key()
 | |
|     {
 | |
|         return $this->key;
 | |
|     }
 | |
|     
 | |
|     function next()
 | |
|     {
 | |
|         if (!$this->current) return;
 | |
| 
 | |
|         if ($this->current->nodeType == XML_TEXT_NODE || $this->current->nodeType == XML_CDATA_SECTION_NODE)
 | |
|         {
 | |
|             if ($this->offset == -1)
 | |
|             {
 | |
|                 // fastest way to get individual Unicode chars and does not require mb_* functions
 | |
|                 //preg_match_all('/./us',$this->current->textContent,$m); $this->words = $m[0];
 | |
|                 $this->words = preg_split("/[\n\r\t ]+/", $this->current->textContent, -1, PREG_SPLIT_NO_EMPTY|PREG_SPLIT_OFFSET_CAPTURE);
 | |
|             }
 | |
|             $this->offset++;
 | |
|             
 | |
|             if ($this->offset < count($this->words)) { 
 | |
|                 $this->key++;
 | |
|                 return;
 | |
|             }
 | |
|             $this->offset = -1;
 | |
|         }
 | |
| 
 | |
|         while($this->current->nodeType == XML_ELEMENT_NODE && $this->current->firstChild)
 | |
|         {
 | |
|             $this->current = $this->current->firstChild;
 | |
|             if ($this->current->nodeType == XML_TEXT_NODE || $this->current->nodeType == XML_CDATA_SECTION_NODE) return $this->next();
 | |
|         }
 | |
| 
 | |
|         while(!$this->current->nextSibling && $this->current->parentNode)
 | |
|         {
 | |
|             $this->current = $this->current->parentNode;
 | |
|             if ($this->current === $this->start) {$this->current = NULL; return;}
 | |
|         }
 | |
| 
 | |
|         $this->current = $this->current->nextSibling;
 | |
| 
 | |
|         return $this->next();
 | |
|     }
 | |
| 
 | |
|     function current()
 | |
|     {
 | |
|         if ($this->current) return $this->words[$this->offset][0];
 | |
|         return NULL;
 | |
|     }
 | |
| 
 | |
|     function valid()
 | |
|     {
 | |
|         return !!$this->current;
 | |
|     }
 | |
| 
 | |
|     function rewind()
 | |
|     {
 | |
|         $this->offset = -1; $this->words = array();
 | |
|         $this->current = $this->start;
 | |
|         $this->next();
 | |
|     }
 | |
| }
 |