2
0

SieveScanner.php 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. <?php namespace Sieve;
  2. include_once('SieveToken.php');
  /**
   * Tokenizer for Sieve scripts.
   *
   * tokenize() splits a script into SieveToken objects using the regular
   * expressions in $tokenMatch_; nextToken() / peekNextToken() then hand the
   * tokens out one at a time while skipping whitespace and comment tokens.
   */
  class SieveScanner
  {
      /** @var callable|null Receives every whitespace/comment token that nextToken() skips. */
      protected $ptFn_ = null;

      /** @var int Index into $tokens_ of the next token nextToken() will read. */
      protected $tokenPos_ = 0;

      /** @var SieveToken[] Tokens produced by tokenize(), in script order. */
      protected $tokens_ = array();

      /** @var int Line number reached by the most recent tokenize() run. */
      private $lastLine_ = 1;

      /**
       * Token type => regex fragment (no delimiters, no anchor).
       *
       * tokenize() joins the fragments into one alternation in exactly this
       * order, so an earlier entry wins whenever several fragments could match
       * at the same position.
       *
       * @var array
       */
      protected $tokenMatch_ = array(
          SieveToken::LeftBracket => '\[',
          SieveToken::RightBracket => '\]',
          SieveToken::BlockStart => '\{',
          SieveToken::BlockEnd => '\}',
          SieveToken::LeftParenthesis => '\(',
          SieveToken::RightParenthesis => '\)',
          SieveToken::Comma => ',',
          SieveToken::Semicolon => ';',
          SieveToken::Whitespace => '[ \r\n\t]+',
          SieveToken::Tag => ':[[:alpha:]_][[:alnum:]_]*(?=\b)',
          // Quoted string, piece by piece (backslashes shown as written in the PHP literal):
          //   "                 match a quotation mark
          //   (                 start matching parts that include an escaped quotation mark
          //   ([^"]*[^"\\\\])   match a string without quotation marks and not ending with a backslash
          //   ?                 this also includes the empty string
          //   (\\\\\\\\)*       match any groups of even number of backslashes
          //                     (thus the character after these groups is not escaped)
          //   \\\\"             match an escaped quotation mark
          //   )*                accept any number of strings that end with an escaped quotation mark
          //   [^"]*             accept any trailing part that does not contain any quotation marks
          //   "                 end of the quoted string
          SieveToken::QuotedString => '"(([^"]*[^"\\\\])?(\\\\\\\\)*\\\\")*[^"]*"',
          SieveToken::Number => '[[:digit:]]+(?:[KMG])?(?=\b)',
          SieveToken::Comment => '(?:\/\*(?:[^\*]|\*(?=[^\/]))*\*\/|#[^\r\n]*\r?(\n|$))',
          SieveToken::MultilineString => 'text:[ \t]*(?:#[^\r\n]*)?\r?\n(\.[^\r\n]+\r?\n|[^\.][^\r\n]*\r?\n)*\.\r?(\n|$)',
          SieveToken::Identifier => '[[:alpha:]_][[:alnum:]_]*(?=\b)',
          SieveToken::Unknown => '[^ \r\n\t]+'
      );

      /**
       * Creates the scanner and tokenizes $script unless it is null.
       *
       * @param string|null $script Script text; with null no tokens are produced.
       */
      public function __construct(&$script)
      {
          if ($script === null) {
              return;
          }

          $this->tokenize($script);
      }

      /**
       * Sets or clears the callback invoked for each token nextToken() skips.
       *
       * A callable is stored as-is. A value that loosely equals null clears
       * the callback. Anything else is ignored and the current callback stays.
       *
       * @param callable|null $callback
       */
      public function setPassthroughFunc($callback)
      {
          if (is_callable($callback)) {
              $this->ptFn_ = $callback;
          } elseif ($callback == null) {
              // Normalize '', 0, false, [] to null instead of storing them verbatim.
              $this->ptFn_ = null;
          }
      }

      /**
       * Splits $script into tokens and appends them to the token list.
       *
       * Scanning stops at the first Unknown token (or when nothing matches at
       * all); in that case no ScriptEnd token is appended. Otherwise the list
       * is terminated with a ScriptEnd token.
       *
       * @param string $script Script text to tokenize.
       */
      public function tokenize(&$script)
      {
          $pos = 0;
          $line = 1;
          $this->lastLine_ = $line;
          $scriptLength = mb_strlen($script);
          $unprocessedScript = $script;

          // Build one regex with a named group per token type so a single
          // preg_match() call identifies the token instead of trying every
          // fragment in turn. Group names are single letters starting at 'A'.
          $nameToType = [];
          $alternatives = [];
          $code = 65; // chr(65) == 'A'
          foreach ($this->tokenMatch_ as $type => $subregex) {
              $name = chr($code++);
              $nameToType[$name] = $type;
              $alternatives[] = "(?P<" . $name . ">^$subregex)";
          }
          $regex = '/' . join('|', $alternatives) . '/';

          while ($pos < $scriptLength) {
              $name = null;
              if (preg_match($regex, $unprocessedScript, $match)) {
                  $name = $this->matchedGroupName($match, $nameToType);
              }

              if ($name === null) {
                  $this->tokens_[] = new SieveToken(SieveToken::Unknown, '', $line);
                  return;
              }

              $type = $nameToType[$name];
              $currentMatch = $match[$name];
              $this->tokens_[] = new SieveToken($type, $currentMatch, $line);

              // Loose comparison on purpose: $type went through an array key,
              // which may have changed its scalar type.
              if ($type == SieveToken::Unknown) {
                  return;
              }

              // Drop only the part just consumed rather than re-slicing the
              // original script from an absolute position.
              $matchLength = mb_strlen($currentMatch);
              $unprocessedScript = mb_substr($unprocessedScript, $matchLength);
              $pos += $matchLength;
              $line += mb_substr_count($currentMatch, "\n");
              $this->lastLine_ = $line;
          }

          $this->tokens_[] = new SieveToken(SieveToken::ScriptEnd, '', $line);
      }

      /**
       * Tells whether the next significant token is of the given type.
       *
       * @param int $type Token type (or mask) passed to SieveToken::is().
       * @return bool
       */
      public function nextTokenIs($type)
      {
          return $this->peekNextToken()->is($type);
      }

      /**
       * Returns the next non-whitespace, non-comment token without consuming it.
       *
       * @return SieveToken
       */
      public function peekNextToken()
      {
          $offset = 0;
          do {
              $next = $this->tokenAt($this->tokenPos_ + $offset++);
          } while ($next->is(SieveToken::Comment | SieveToken::Whitespace));

          return $next;
      }

      /**
       * Consumes and returns the next non-whitespace, non-comment token.
       *
       * Each skipped whitespace/comment token is passed to the passthrough
       * callback when one is set.
       *
       * @return SieveToken
       */
      public function nextToken()
      {
          $token = $this->tokenAt($this->tokenPos_++);
          while ($token->is(SieveToken::Comment | SieveToken::Whitespace)) {
              if ($this->ptFn_ != null) {
                  call_user_func($this->ptFn_, $token);
              }
              $token = $this->tokenAt($this->tokenPos_++);
          }

          return $token;
      }

      /**
       * Finds the named group that produced the match.
       *
       * The check is "set and not the empty string" rather than truthiness,
       * because a valid token text such as "0" is falsy in PHP.
       *
       * @param array $match      Result array filled by preg_match().
       * @param array $nameToType Group name => token type, in regex order.
       * @return string|null Group name, or null when no named group matched.
       */
      private function matchedGroupName(array $match, array $nameToType)
      {
          foreach ($nameToType as $name => $unusedType) {
              if (isset($match[$name]) && $match[$name] !== '') {
                  return $name;
              }
          }

          return null;
      }

      /**
       * Returns the stored token at $index, or a ScriptEnd token when $index
       * is past the end of the list, so callers never dereference null.
       *
       * @param int $index
       * @return SieveToken
       */
      private function tokenAt($index)
      {
          if (isset($this->tokens_[$index])) {
              return $this->tokens_[$index];
          }

          return new SieveToken(SieveToken::ScriptEnd, '', $this->lastLine_);
      }
  }