123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- <?php namespace Sieve;
- include_once('SieveToken.php');
/**
 * Lexical scanner for Sieve scripts.
 *
 * Splits a script into a flat list of SieveToken objects and lets a consumer
 * walk that list while skipping comment and whitespace tokens.
 */
class SieveScanner
{
    /**
     * Creates a scanner and, unless $script is null, tokenizes it right away.
     *
     * @param string|null $script Script text; null leaves the token list empty
     *                            so that tokenize() can be called later.
     */
    public function __construct(&$script)
    {
        if ($script === null) {
            return;
        }

        $this->tokenize($script);
    }

    /**
     * Registers the callback that nextToken() invokes for every comment or
     * whitespace token it skips.
     *
     * A value loosely equal to null clears the callback; any other
     * non-callable value is ignored and the previous callback is kept.
     *
     * @param callable|null $callback Receives the skipped SieveToken.
     */
    public function setPassthroughFunc($callback)
    {
        if ($callback == null || is_callable($callback)) {
            $this->ptFn_ = $callback;
        }
    }

    /**
     * Converts $script into tokens and appends them to the internal list.
     *
     * Scanning stops at the first Unknown token (which is still recorded).
     * Otherwise a ScriptEnd token is appended once the whole script has been
     * consumed. The script itself is not modified.
     *
     * @param string $script Script text to scan.
     */
    public function tokenize(&$script)
    {
        $pos = 0;
        $line = 1;
        $scriptLength = mb_strlen($script);
        $unprocessedScript = $script;

        // Build one alternation with a named group per token type so a single
        // preg_match() call both finds the token and identifies its type.
        $nameToType = [];
        $alternatives = [];
        // chr(65) == 'A'
        $i = 65;
        foreach ($this->tokenMatch_ as $type => $subregex) {
            $groupName = chr($i);
            $nameToType[$groupName] = $type;
            $alternatives[] = "(?P<" . $groupName . ">^$subregex)";
            $i++;
        }
        $regex = '/' . join('|', $alternatives) . '/';

        while ($pos < $scriptLength) {
            if (!preg_match($regex, $unprocessedScript, $match)) {
                $this->tokens_[] = new SieveToken(SieveToken::Unknown, '', $line);
                return;
            }

            // Keep only the named group that actually matched. The value is
            // compared against '' explicitly: a bare array_filter() would also
            // discard the string "0", so a token consisting of a single zero
            // would be lost and no token type could be determined for it.
            $filterMatch = array_filter(
                $match,
                function ($value, $key) {
                    return is_string($key) && $value !== '';
                },
                ARRAY_FILTER_USE_BOTH
            );

            // The first (and only) remaining entry is keyed by the group name,
            // which maps back to the token type.
            $type = $nameToType[key($filterMatch)];
            $currentMatch = current($filterMatch);

            $this->tokens_[] = new SieveToken($type, $currentMatch, $line);

            if ($type == SieveToken::Unknown) {
                return;
            }

            // Drop just the consumed prefix rather than re-slicing the full
            // script from $pos on every iteration.
            $matchLength = mb_strlen($currentMatch);
            $unprocessedScript = mb_substr($unprocessedScript, $matchLength);
            $pos += $matchLength;
            $line += mb_substr_count($currentMatch, "\n");
        }

        $this->tokens_[] = new SieveToken(SieveToken::ScriptEnd, '', $line);
    }

    /**
     * Tells whether the next non-comment, non-whitespace token is of $type.
     *
     * @param mixed $type Token type (or types) as understood by SieveToken::is().
     * @return mixed Whatever SieveToken::is() returns for that token.
     */
    public function nextTokenIs($type)
    {
        return $this->peekNextToken()->is($type);
    }

    /**
     * Returns the next non-comment, non-whitespace token without consuming it.
     *
     * NOTE(review): assumes the token list still holds such a token at or
     * after the current position; there is no bounds check.
     *
     * @return SieveToken
     */
    public function peekNextToken()
    {
        $offset = 0;
        do {
            $next = $this->tokens_[$this->tokenPos_ + $offset++];
        } while ($next->is(SieveToken::Comment|SieveToken::Whitespace));

        return $next;
    }

    /**
     * Consumes and returns the next non-comment, non-whitespace token.
     *
     * Every comment or whitespace token skipped on the way is handed to the
     * passthrough callback, if one is set.
     *
     * NOTE(review): like peekNextToken(), this does not guard against running
     * past the end of the token list.
     *
     * @return SieveToken
     */
    public function nextToken()
    {
        $token = $this->tokens_[$this->tokenPos_++];

        while ($token->is(SieveToken::Comment|SieveToken::Whitespace)) {
            if ($this->ptFn_ != null) {
                call_user_func($this->ptFn_, $token);
            }

            $token = $this->tokens_[$this->tokenPos_++];
        }

        return $token;
    }

    /** @var callable|null Callback for skipped comment/whitespace tokens. */
    protected $ptFn_ = null;

    /** @var int Index into $tokens_ of the next token to hand out. */
    protected $tokenPos_ = 0;

    /** @var SieveToken[] Tokens produced by tokenize(), in script order. */
    protected $tokens_ = [];

    /**
     * Token type => regular expression fragment (without delimiters).
     *
     * Order matters: tokenize() joins the fragments into one alternation, so
     * earlier entries win over later ones. Unknown is the catch-all and must
     * stay last.
     */
    protected $tokenMatch_ = [
        SieveToken::LeftBracket => '\[',
        SieveToken::RightBracket => '\]',
        SieveToken::BlockStart => '\{',
        SieveToken::BlockEnd => '\}',
        SieveToken::LeftParenthesis => '\(',
        SieveToken::RightParenthesis => '\)',
        SieveToken::Comma => ',',
        SieveToken::Semicolon => ';',
        SieveToken::Whitespace => '[ \r\n\t]+',
        SieveToken::Tag => ':[[:alpha:]_][[:alnum:]_]*(?=\b)',
        /*
        "                   # match a quotation mark
        (                   # start matching parts that include an escaped quotation mark
        ([^"]*[^"\\\\])     # match a string without quotation marks and not ending with a backslash
        ?                   # this also includes the empty string
        (\\\\\\\\)*         # match any groups of even number of backslashes
                            # (thus the character after these groups are not escaped)
        \\\\"               # match an escaped quotation mark
        )*                  # accept any number of strings that end with an escaped quotation mark
        [^"]*               # accept any trailing part that does not contain any quotation marks
        "                   # end of the quoted string
        */
        SieveToken::QuotedString => '"(([^"]*[^"\\\\])?(\\\\\\\\)*\\\\")*[^"]*"',
        SieveToken::Number => '[[:digit:]]+(?:[KMG])?(?=\b)',
        SieveToken::Comment => '(?:\/\*(?:[^\*]|\*(?=[^\/]))*\*\/|#[^\r\n]*\r?(\n|$))',
        SieveToken::MultilineString => 'text:[ \t]*(?:#[^\r\n]*)?\r?\n(\.[^\r\n]+\r?\n|[^\.][^\r\n]*\r?\n)*\.\r?(\n|$)',
        SieveToken::Identifier => '[[:alpha:]_][[:alnum:]_]*(?=\b)',
        SieveToken::Unknown => '[^ \r\n\t]+'
    ];
}
|