Skip to content

Commit afed83c

Browse files
committed
Implement Handlebars lexer
This should produce the exact same tokens for any template as the official Handlebars JS implementation.
1 parent cb148cc commit afed83c

File tree

5 files changed

+364
-0
lines changed

5 files changed

+364
-0
lines changed

src/Lexer.php

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars;
4+
5+
use DevTheorem\Handlebars\Phlexer\Phlexer;
6+
use DevTheorem\Handlebars\Phlexer\Rule;
7+
8+
/**
 * Implements the same lexical tokenization from
 * https://github.com/handlebars-lang/handlebars-parser/blob/master/src/handlebars.l
 * (as of 2025-01-07).
 */
final class Lexer extends Phlexer
{
    /**
     * Registers the ordered rule table with the parent lexer.
     *
     * Each Rule pairs a list of start conditions (an empty list means the
     * initial state) with a regular expression fragment and a handler. The
     * handler returns the token name, or null when the match only changes
     * lexer state or should be skipped. Rule order is significant and mirrors
     * the order used in handlebars.l.
     */
    public function __construct()
    {
        $LEFT_STRIP = $RIGHT_STRIP = '~';
        $LOOKAHEAD = '[=~}\\s\\/.)\\]|]';
        $LITERAL_LOOKAHEAD = '[~}\\s)\\]]';

        /*
         * ID is the inverse of control characters.
         * Control characters ranges:
         *   [\s]          Whitespace
         *   [!"#%-,\./]   !, ", #, %, &, ', (, ), *, +, ,, ., /,  Exceptions in range: $, -
         *   [;->@]        ;, <, =, >, @,                          Exceptions in range: :, ?
         *   [\[-\^`]      [, \, ], ^, `,                          Exceptions in range: _
         *   [\{-~]        {, |, }, ~
         */
        $CTRL_INVERSE = '[^\\s!"#%-,\\.\\/;->@\\[-\\^`\\{-~]+';
        $ID = $CTRL_INVERSE . '(?=' . $LOOKAHEAD . ')';

        parent::__construct([
            // content up to the next mustache; a trailing backslash decides
            // whether the mustache is escaped ('emu') or a real one ('mu')
            new Rule([], '[^\\x00]*?(?={{)', function () {
                if (str_ends_with($this->yytext, "\\\\")) {
                    // escaped backslash: drop one backslash, mustache is live
                    $this->strip(0, 1);
                    $this->pushState('mu');
                } elseif (str_ends_with($this->yytext, "\\")) {
                    // escaped mustache: drop the backslash, treat "{{" as content
                    $this->strip(0, 1);
                    $this->pushState('emu');
                } else {
                    $this->pushState('mu');
                }

                return $this->yytext !== '' ? 'CONTENT' : null;
            }),

            new Rule([], '[^\\x00]+', fn() => 'CONTENT'),

            // marks CONTENT up to the next mustache or escaped mustache.
            // \z (not \Z) is used for end of input: \Z would also match just
            // before a trailing newline and split the final CONTENT token.
            new Rule(['emu'], '[^\\x00]{2,}?(?={{|\\\\{{|\\\\\\\\{{|\\z)', function () {
                $this->popState();
                return 'CONTENT';
            }),

            // nested raw block will create stacked 'raw' condition
            new Rule(['raw'], '{{{{(?=[^\\/])', function () {
                $this->pushState('raw');
                return 'CONTENT';
            }),

            new Rule(['raw'], '{{{{\\/' . $CTRL_INVERSE . '(?=[=}\\s\\/.])}}}}', function () {
                $this->popState();

                if ($this->topState() === 'raw') {
                    // still inside an outer raw block: the closer is plain content
                    return 'CONTENT';
                } else {
                    // drop the leading "{{{{/" (5 bytes) and trailing "}}}}" (4 bytes)
                    $this->strip(5, 9);
                    return 'END_RAW_BLOCK';
                }
            }),
            new Rule(['raw'], '[^\\x00]+?(?={{{{)', fn() => 'CONTENT'),

            new Rule(['com'], '[\\s\\S]*?--' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'COMMENT';
            }),

            new Rule(['mu'], '\\(', fn() => 'OPEN_SEXPR'),
            new Rule(['mu'], '\\)', fn() => 'CLOSE_SEXPR'),

            new Rule(['mu'], '\\[', function () {
                // Assuming yy.syntax.square === 'string'. OPEN_ARRAY option not handled
                $this->rewind(strlen($this->yytext));
                // escaped literal
                $this->pushState('escl');
                return null;
            }),
            new Rule(['mu'], ']', fn() => 'CLOSE_ARRAY'),

            new Rule(['mu'], '{{{{', fn() => 'OPEN_RAW_BLOCK'),
            new Rule(['mu'], '}}}}', function () {
                $this->popState();
                $this->pushState('raw');
                return 'CLOSE_RAW_BLOCK';
            }),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?>', fn() => 'OPEN_PARTIAL'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?#>', fn() => 'OPEN_PARTIAL_BLOCK'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?#\\*?', fn() => 'OPEN_BLOCK'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?\\/', fn() => 'OPEN_ENDBLOCK'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?\\^\\s*' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'INVERSE';
            }),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?\\s*else\\s*' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'INVERSE';
            }),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?\\^', fn() => 'OPEN_INVERSE'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?\\s*else', fn() => 'OPEN_INVERSE_CHAIN'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?{', fn() => 'OPEN_UNESCAPED'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?&', fn() => 'OPEN'),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?!--', function () {
                // re-scan the opener in the 'com' state so the whole comment
                // becomes a single COMMENT token
                $this->rewind(strlen($this->yytext));
                $this->popState();
                $this->pushState('com');
                return null;
            }),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?![\\s\\S]*?}}', function () {
                $this->popState();
                return 'COMMENT';
            }),
            new Rule(['mu'], '{{' . $LEFT_STRIP . '?\\*?', fn() => 'OPEN'),

            new Rule(['mu'], '=', fn() => 'EQUALS'),
            new Rule(['mu'], '\\.\\.', fn() => 'ID'),
            new Rule(['mu'], '\\.(?=' . $LOOKAHEAD . ')', fn() => 'ID'),
            new Rule(['mu'], '\\.#', fn() => 'PRIVATE_SEP'),
            new Rule(['mu'], '[\\/.]', fn() => 'SEP'),
            new Rule(['mu'], '\\s+', fn() => null), // ignore whitespace
            new Rule(['mu'], '}' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'CLOSE_UNESCAPED';
            }),
            new Rule(['mu'], $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'CLOSE';
            }),
            // double-quoted string
            new Rule(['mu'], '"(\\\\["]|[^"])*"', function () {
                // remove the surrounding quotes, then unescape \" to "
                $this->strip(1, 2);
                $this->replace('/\\\\"/', '"');
                return 'STRING';
            }),
            // single quoted string
            new Rule(['mu'], "'(\\\\[']|[^'])*'", function () {
                // remove the surrounding quotes, then unescape \' to '
                $this->strip(1, 2);
                $this->replace("/\\\\'/", "'");
                return 'STRING';
            }),
            new Rule(['mu'], '@', fn() => 'DATA'),
            new Rule(['mu'], 'true(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'BOOLEAN'),
            new Rule(['mu'], 'false(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'BOOLEAN'),
            new Rule(['mu'], 'undefined(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'UNDEFINED'),
            new Rule(['mu'], 'null(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'NULL'),
            new Rule(['mu'], '\\-?[0-9]+(?:\\.[0-9]+)?(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'NUMBER'),
            new Rule(['mu'], 'as\\s+\\|', fn() => 'OPEN_BLOCK_PARAMS'),
            new Rule(['mu'], '\\|', fn() => 'CLOSE_BLOCK_PARAMS'),

            new Rule(['mu'], $ID, fn() => 'ID'),

            // bracketed literal segment, e.g. [foo bar]; the brackets are kept
            // and escaped backslashes / closing brackets are unescaped
            new Rule(['escl'], '\\[(\\\\\\]|[^\\]])*\\]', function () {
                $this->replace('/\\\\([\\\\\\]])/', '$1');
                $this->popState();
                return 'ID';
            }),

            new Rule(['mu'], '.', fn() => 'INVALID'),

            // end of input; \z matches only at the very end of the subject
            new Rule(['INITIAL', 'mu'], '\\z', fn() => 'EOF'),
        ]);
    }

    /**
     * Trims the current match ($this->yytext) in place.
     *
     * @param int $start number of bytes dropped from the front
     * @param int $end   total number of bytes removed, i.e. the result keeps
     *                   strlen(yytext) - $end bytes starting at $start
     *                   (strip(1, 2) removes one leading and one trailing byte)
     */
    private function strip(int $start, int $end): void
    {
        $this->yytext = substr($this->yytext, $start, strlen($this->yytext) - $end);
    }

    /**
     * Applies a regular-expression replacement to the current match in place.
     *
     * @param string $pattern     complete PCRE pattern, including delimiters
     * @param string $replacement replacement string passed to preg_replace()
     *
     * @throws \Exception when preg_replace() reports an error
     */
    private function replace(string $pattern, string $replacement): void
    {
        $result = preg_replace($pattern, $replacement, $this->yytext);

        if ($result === null) {
            throw new \Exception('Failed to replace string: ' . preg_last_error_msg());
        }

        $this->yytext = $result;
    }
}

src/Phlexer/Phlexer.php

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Phlexer;
4+
5+
/**
 * Minimal regex-driven lexer with a stack of start conditions.
 *
 * Subclasses supply an ordered list of rules. At each position the first rule
 * that is active in the current state and matches at the cursor wins; its
 * handler may adjust state or the matched text and returns the token name
 * (or null to emit nothing).
 */
abstract class Phlexer
{
    const INITIAL_STATE = 'INITIAL';

    /**
     * Stack of start conditions; the last entry is the active one.
     *
     * @var string[]
     */
    private array $states = [self::INITIAL_STATE];

    /**
     * Input being tokenized. Initialised to an empty string so the public
     * methods are safe to call before tokenize().
     */
    private string $text = '';

    /**
     * Byte offset of the next unread character in $text.
     */
    private int $cursor = 0;

    /**
     * The current matched value
     */
    protected string $yytext = '';

    /**
     * @param Rule[] $rules rules in priority order (earlier rules win)
     */
    public function __construct(protected array $rules) {}

    /**
     * Tokenizes the given text from scratch.
     *
     * All lexer state is reset first, so the same instance can be reused for
     * several inputs.
     *
     * @return Token[]
     *
     * @throws \Exception when no rule matches at some position or a pattern fails
     */
    public function tokenize(string $text): array
    {
        $this->text = $text;
        $this->cursor = 0;
        $this->states = [self::INITIAL_STATE];
        $this->yytext = '';
        $tokens = [];

        while (($token = $this->getNextToken()) !== null) {
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Reports whether any unread input remains.
     */
    public function hasMoreTokens(): bool
    {
        return $this->cursor < strlen($this->text);
    }

    /**
     * Returns the next token, or null once the input is exhausted.
     *
     * Matches whose handler returns null (whitespace, pure state changes) are
     * skipped by looping rather than recursing, so long runs of skipped
     * matches cannot exhaust the call stack.
     *
     * @throws \Exception when no active rule matches at the cursor, or when
     *                    the regex engine reports an error for a rule pattern
     */
    public function getNextToken(): ?Token
    {
        while ($this->hasMoreTokens()) {
            // Both values are invariant while scanning the rules: state and
            // cursor only change inside a handler, after which we leave the loop.
            $subject = substr($this->text, $this->cursor);
            $state = $this->topState();
            $skipped = false;

            foreach ($this->rules as $rule) {
                if (!$rule->hasStartCondition($state)) {
                    continue;
                }

                $result = preg_match("/\\A{$rule->pattern}/", $subject, $matches);

                if ($result === false) {
                    // e.g. backtrack limit reached; silently trying the next
                    // rule would produce wrong tokens
                    throw new \Exception('Failed to match pattern: ' . preg_last_error_msg());
                }

                if ($result === 0) {
                    continue;
                }

                $this->yytext = $matches[0];
                $this->cursor += strlen($this->yytext);
                $tokenName = ($rule->handler)();

                if ($tokenName !== null) {
                    return new Token($tokenName, $this->yytext);
                }

                // skip token - e.g. whitespace or changing state
                $skipped = true;
                break;
            }

            if (!$skipped) {
                // report the character at the current position, not the start of the input
                throw new \Exception("Unexpected token: \"{$this->text[$this->cursor]}\"");
            }
        }

        return null;
    }

    /**
     * Enters a new start condition on top of the current one.
     */
    protected function pushState(string $state): void
    {
        $this->states[] = $state;
    }

    /**
     * Leaves the current start condition.
     */
    protected function popState(): void
    {
        array_pop($this->states);
    }

    /**
     * Returns the active start condition, falling back to the initial state
     * when the stack is empty.
     */
    protected function topState(): string
    {
        $lastKey = array_key_last($this->states);

        if ($lastKey === null) {
            return self::INITIAL_STATE;
        }

        return $this->states[$lastKey];
    }

    /**
     * Moves the cursor back by the given number of bytes so that input is
     * scanned again (typically under a different start condition).
     */
    protected function rewind(int $length): void
    {
        $this->cursor -= $length;
    }
}

src/Phlexer/Rule.php

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Phlexer;
4+
5+
/**
 * One lexer rule: the start conditions it is active in, the pattern to match
 * and the handler invoked on a match.
 */
readonly class Rule
{
    /**
     * States in which this rule may be applied.
     *
     * @var string[]
     */
    public array $startConditions;

    /**
     * @param string[] $startConditions states the rule is active in; an empty
     *                                  list means the initial state only
     * @param string $pattern regular expression fragment to match
     * @param \Closure(): ?string $handler invoked on a match
     */
    public function __construct(
        array $startConditions,
        public string $pattern,
        public \Closure $handler,
    ) {
        if ($startConditions === []) {
            $startConditions = [Phlexer::INITIAL_STATE];
        }

        $this->startConditions = $startConditions;
    }

    /**
     * Tells whether the rule is active in the given state (strict comparison).
     */
    public function hasStartCondition(string $condition): bool
    {
        foreach ($this->startConditions as $candidate) {
            if ($candidate === $condition) {
                return true;
            }
        }

        return false;
    }
}

src/Phlexer/Token.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Phlexer;
4+
5+
/**
 * Immutable lexer token: a token name and the text associated with it.
 */
readonly class Token
{
    /**
     * Token name, e.g. the string returned by a rule handler.
     */
    public string $name;

    /**
     * Text carried by the token.
     */
    public string $text;

    public function __construct(string $name, string $text)
    {
        $this->name = $name;
        $this->text = $text;
    }
}

test/LexerTest.php

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Test;
4+
5+
use DevTheorem\Handlebars\Lexer;
6+
use DevTheorem\Handlebars\Phlexer\Token;
7+
use PHPUnit\Framework\Attributes\DataProvider;
8+
use PHPUnit\Framework\TestCase;
9+
10+
/**
 * Runs the lexer against the tokenizer cases from the handlebars-spec package.
 */
class LexerTest extends TestCase
{
    /**
     * Loads the tokenizer spec and exposes every case as a named data set.
     *
     * The path is anchored to this file's directory so the provider works
     * regardless of the current working directory.
     *
     * @return array<string, array{array<string, mixed>}>
     *
     * @throws \RuntimeException when the spec file cannot be read
     * @throws \JsonException when the spec file is not valid JSON
     */
    public static function jsonSpecProvider(): array
    {
        $filename = __DIR__ . '/../vendor/jbboehr/handlebars-spec/spec/tokenizer.json';
        $contents = file_get_contents($filename);

        if ($contents === false) {
            throw new \RuntimeException("Unable to read tokenizer spec file: {$filename}");
        }

        $specs = json_decode($contents, true, 512, JSON_THROW_ON_ERROR);
        $dataSets = [];

        foreach ($specs as $index => $spec) {
            // the index prefix keeps names unique even if two cases share a description
            $dataSets["#{$index} {$spec['it']}"] = [$spec];
        }

        return $dataSets;
    }

    /**
     * Asserts that the lexer emits exactly the token names and texts listed
     * in the spec case.
     */
    #[DataProvider("jsonSpecProvider")]
    public function testSpecs(array $spec): void
    {
        // fix invalid expectations
        if ($spec['it'] === 'does not time out in a mustache with a single } followed by EOF') {
            $spec['expected'][] = ['name' => 'INVALID', 'text' => '}'];
        } elseif ($spec['it'] === 'does not time out in a mustache when invalid ID characters are used') {
            $spec['expected'][] = ['name' => 'INVALID', 'text' => '&'];
            $spec['expected'][] = ['name' => 'CLOSE', 'text' => '}}'];
        }

        $lexer = new Lexer();
        $toJson = fn(Token $t) => ['name' => $t->name, 'text' => $t->text];
        $actual = array_map($toJson, $lexer->tokenize($spec['template']));
        $this->assertSame($spec['expected'], $actual);
    }
}

0 commit comments

Comments
 (0)