=encoding utf8
=head1 NAME
std/path/z/lexer - Pure Zuzu lexer for ZPath expressions.
=head1 IMPLEMENTATION SUPPORT
This module is supported by all implementations of ZuzuScript.
=head1 DESCRIPTION
This module provides the pure-Zuzu lexer used by ZPath.
=head1 EXPORTS
=head2 Classes
=over
=item C<< Lexer({ src: String, allowed_operators: Array }) >>
Constructs a ZPath lexer. Returns: C<Lexer>. Tokenizes C<src> using the
allowed operator table.
=over
=item C<< lexer.peek() >>
Parameters: none. Returns: C<Dict>. Returns the current token without
advancing.
=item C<< lexer.peek_n(n) >>
Parameters: C<n> is a zero-based lookahead offset. Returns: C<Dict>.
Returns a token ahead of the current position.
=item C<< lexer.peek_kind() >>
Parameters: none. Returns: C<String>. Returns the current token kind.
=item C<< lexer.peek_kind_n(n) >>
Parameters: C<n> is a zero-based lookahead offset. Returns: C<String>.
Returns a token kind ahead of the current position.
=item C<< lexer.next_tok() >>
Parameters: none. Returns: C<Dict>. Consumes and returns the current
token.
=item C<< lexer.expect(k) >>
Parameters: C<k> is the expected token kind. Returns: C<Dict>. Consumes
and returns the current token, throwing when it has a different kind.
=item C<< lexer.known_operators() >>
Parameters: none. Returns: C<Array>. Returns the allowed operator
definitions.
=back
=back
=head1 COPYRIGHT AND LICENCE
B<< std/path/z/lexer >> is copyright Toby Inkster.
It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.
=cut
from std/string import substr;
class Lexer {
let src := "";
let scan_i := 0;
let toks := [];
let pos := 0;
let allowed_operators;
method __build__ () {
die "Expected some operators" unless allowed_operators;
toks := self._tokenize(src);
}
method peek () {
return toks[pos];
}
method peek_n ( n ) {
return toks[pos + n];
}
method peek_kind () {
return toks[pos]{k};
}
method peek_kind_n ( n ) {
return toks[pos + n]{k};
}
method next_tok () {
let tok := toks[pos];
pos++;
return tok;
}
method expect ( k ) {
let t := self.next_tok();
die `Expected ${k}, got ${t{k}}` if t{k} ≢ k;
return t;
}
method _is_ws ( c ) {
return c ≢ null and c ~ /\s/;
}
method _prev_sig ( chars, idx ) {
let j := idx - 1;
while ( j >= 0 ) {
if ( chars[j] ~ /\s/ ) {
j--;
next;
}
return chars[j];
}
return null;
}
method _next_sig ( chars, idx, n ) {
let j := idx + 1;
while ( j < n ) {
if ( chars[j] ~ /\s/ ) {
j++;
next;
}
return chars[j];
}
return null;
}
method _is_path_ctx_prev ( c ) {
return c ≡ "[" or c ≡ "(" or c ≡ "," or c ≡ ":" or c ≡ "?" or c ≡ "/";
}
method _is_path_ctx_next ( c ) {
return c ≡ "]" or c ≡ ")" or c ≡ "," or c ≡ ":" or c ≡ "?" or c ≡ "/";
}
method _ws_on_both ( left, right ) {
return self._is_ws(left) and self._is_ws(right);
}
method known_operators () {
return allowed_operators;
}
method _sorted_operators () {
return self.known_operators()
.grep( fn o → not o.lexer_should_ignore )
.sort( fn ( x, y ) → y.char_length <=> x.char_length );
}
method _operator_at ( chars, i, n, ops ) {
let oi := 0;
while ( oi < ops.length ) {
let op := ops[oi];
let spell := op{spelling};
let m := length spell;
if ( i + m <= n ) {
let got := "";
let j := 0;
while ( j < m ) {
got _= chars[i + j];
j++;
}
if ( got ≡ spell ) {
return op;
}
}
oi++;
}
return null;
}
method _is_name_char ( c ) {
return c ~ /[A-Za-z0-9_\-]/;
}
method _allows_operator_kind ( kind ) {
return allowed_operators.first( fn op → op.get_kind() ≡ kind )
? true
: false;
}
method _tokenize ( String source ) {
let t := [];
let chars := [];
let n := length source;
let ops := self._sorted_operators();
let z := 0;
while ( z < n ) {
chars.push( substr( source, z, 1 ) );
z++;
}
let i := 0;
while ( i < n ) {
let ch := chars[i];
if ( ch ~ /\s/ ) {
i++;
next;
}
let prev := i > 0 ? chars[i - 1] : null;
let nxt := i + 1 < n ? chars[i + 1] : null;
let prev_nonws := self._prev_sig( chars, i );
let next_nonws := self._next_sig( chars, i, n );
function push_token ( tok ) {
tok{ws_before} := self._is_ws(prev) ? true : false;
tok{ws_after} := self._is_ws(nxt) ? true : false;
t.push( tok );
}
function push_sized_token ( tok, after ) {
tok{ws_before} := self._is_ws(prev) ? true : false;
tok{ws_after} := self._is_ws(after) ? true : false;
t.push( tok );
}
if ( ch ≡ "/" ) {
if (
self._ws_on_both( prev, nxt )
and prev_nonws ≢ null and next_nonws ≢ null
and not self._is_path_ctx_prev(prev_nonws)
and not self._is_path_ctx_next(next_nonws)
) {
push_token( { k: "SLASH", v: "/" } );
i++;
next;
}
if (
( self._is_ws(prev) xor self._is_ws(nxt) )
and prev_nonws ≢ null and next_nonws ≢ null
and not self._is_path_ctx_prev(prev_nonws)
and not self._is_path_ctx_next(next_nonws)
) {
die `Binary operator '/' requires whitespace around it`;
}
t.push( { k: "SLASH_PATH", v: "/" } );
i++;
next;
}
let op := self._operator_at( chars, i, n, ops );
if ( op ≢ null ) {
let spell := op.get_spelling();
let need_ws := op.requires_whitespace();
let m := length spell;
let op_prev := i > 0 ? chars[i - 1] : null;
let op_next := i + m < n ? chars[i + m] : null;
let path_ambiguous_star :=
( spell ≡ "*" or spell ≡ "**" )
and not self._ws_on_both( op_prev, op_next );
let left_name := false;
let right_name := false;
if ( spell ~ /^[A-Za-z_]/ ) {
left_name := op_prev ≢ null and self._is_name_char(op_prev);
right_name := op_next ≢ null and self._is_name_char(op_next);
}
if (
not path_ambiguous_star
and not left_name
and not right_name
) {
if ( need_ws ) {
if ( self._ws_on_both( op_prev, op_next ) ) {
push_sized_token( { k: op.get_kind(), v: spell }, op_next );
i := i + m;
next;
}
die `Binary operator '${spell}' requires whitespace around it`;
}
push_sized_token( { k: op.get_kind(), v: spell }, op_next );
i := i + m;
next;
}
}
if ( ch ≡ "(" ) { t.push( { k: "LPAREN", v: "(" } ); i++; next; }
if ( ch ≡ ")" ) { t.push( { k: "RPAREN", v: ")" } ); i++; next; }
if ( ch ≡ "[" ) { t.push( { k: "LBRACK", v: "[" } ); i++; next; }
if ( ch ≡ "]" ) { t.push( { k: "RBRACK", v: "]" } ); i++; next; }
if ( ch ≡ "," ) { t.push( { k: "COMMA", v: "," } ); i++; next; }
if ( ch ≡ "." ) {
if ( i + 2 < n and chars[i + 1] ≡ "." and chars[i + 2] ≡ "*" ) {
push_token( { k: "DOTDOTSTAR", v: "..*" } );
i := i + 3;
next;
}
if ( i + 1 < n and chars[i + 1] ≡ "." ) {
push_token( { k: "DOTDOT", v: ".." } );
i := i + 2;
next;
}
push_token( { k: "DOT", v: "." } );
i++;
next;
}
if ( ch ≡ "*" and self._ws_on_both( prev, nxt ) ) {
push_token( { k: "STAR", v: "*" } );
i++;
next;
}
if ( ch ≡ "*" ) {
if ( i + 1 < n and chars[i + 1] ≡ "*" ) {
push_token( { k: "STARSTAR", v: "**" } );
i := i + 2;
next;
}
push_token( { k: "STAR_PATH", v: "*" } );
i++;
next;
}
if ( self._allows_operator_kind( "ELVIS" ) and ch ≡ "?" and nxt ≡ ":" ) {
let op_next := i + 2 < n ? chars[i + 2] : null;
if ( self._ws_on_both( prev, op_next ) ) {
t.push( {
k: "ELVIS",
v: "?:",
ws_before: self._is_ws(prev) ? true : false,
ws_after: self._is_ws(op_next) ? true : false,
} );
i := i + 2;
next;
}
die "Elvis operator '?:' requires whitespace around it";
}
if ( ch ≡ "?" or ch ≡ ":" ) {
if ( self._ws_on_both( prev, nxt ) ) {
push_token( { k: ch ≡ "?" ? "QMARK" : "COLON", v: ch } );
i++;
next;
}
die `Ternary operator '${ch}' requires whitespace around it`;
}
if ( ch ≡ "\"" or ch ≡ "'" ) {
let quote := ch;
i++;
let s := "";
let esc := false;
while ( i < n ) {
let cc := chars[i];
i++;
if ( esc ) {
if ( cc ≡ "\\" or cc ≡ quote or cc ≡ "\"" or cc ≡ "'" ) {
s _= cc;
}
else {
s _= self._unescape_char(cc);
}
esc := false;
next;
}
if ( cc ≡ "\\" ) { esc := true; next; }
last if cc ≡ quote;
s _= cc;
}
push_token( { k: "STRING", v: s, q: quote } );
next;
}
if ( ch ≡ "#" ) {
let j := i + 1;
let neg := false;
if ( j < n and chars[j] ≡ "-" ) {
neg := true;
j++;
}
die "Invalid index '#'" if j >= n or not( chars[j] ~ /\d/ );
let num := "";
while ( j < n and chars[j] ~ /\d/ ) {
num _= chars[j];
j++;
}
let parsed := 0 + num;
parsed := 0 - parsed if neg;
push_token( { k: "INDEX", v: parsed } );
i := j;
next;
}
if ( ch ~ /[0-9]/ ) {
let j := i;
let num := "";
while ( j < n and chars[j] ~ /[0-9.]/ ) {
num _= chars[j];
j++;
}
push_token( { k: "NUMBER", v: 0 + num } );
i := j;
next;
}
let name := self._read_name( chars, i );
if ( name{v} ≢ "" ) {
push_token( { k: "NAME", v: name{v} } );
i := name{i};
next;
}
die `Unexpected character '${ch}' at position ${i}`;
}
t.push( { k: "EOF", v: "" } );
return t;
}
method _unescape_char ( c ) {
return "\n" if c ≡ "n";
return "\r" if c ≡ "r";
return "\t" if c ≡ "t";
return c;
}
method _read_name ( chars, start_i ) {
let n := chars.length();
let delim := {
"\n": true,
"\r": true,
"\t": true,
"(": true,
")": true,
"[": true,
"]": true,
"/": true,
",": true,
"=": true,
"&": true,
"|": true,
"!": true,
"<": true,
">": true,
"#": true,
" ": true,
};
let buf := "";
let esc := false;
let i := start_i;
while ( i < n ) {
let c := chars[i];
if ( esc ) {
buf _= c;
esc := false;
i++;
next;
}
if ( c ≡ "\\" ) {
esc := true;
i++;
next;
}
last if delim.exists(c);
last if c ~ /\s/;
buf _= c;
i++;
}
return { v: "", i: start_i } if buf ≡ "";
return { v: buf, i: i };
}
}
std/path/z/lexer
Standard Library source code
Pure Zuzu lexer for ZPath expressions.
Module
- Name
std/path/z/lexer- Area
- Standard Library
- Source
modules/std/path/z/lexer.zzm