Files
nodejs.esm-library/experiments/generic-parser-2.mjs

146 lines
3.0 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { Row_Based_Table } from '@efforting.tech/table';
import { load_raster_table } from '@efforting.tech/table/raster-table';
import { RegExp_Tokenizer } from '@efforting.tech/parsing/regexp-dispatch';
import { RegExp_Token_Parsing_Rule, Parser } from '@efforting.tech/parsing/generic-parsing';
/**
 * Build a table from raster-formatted text and trim every cell.
 *
 * The text is handed unchanged to `load_raster_table` together with
 * `Row_Based_Table`; each cell of the resulting table is then replaced by
 * its `trim()`-ed value through `replace_all_cells`.
 *
 * @param raster - Raster table source text.
 * @returns The table object produced by `load_raster_table`.
 */
function load_table(raster) {
	const loaded = load_raster_table(raster, Row_Based_Table);
	const strip_whitespace = ({ cell }) => cell.trim();
	loaded.replace_all_cells(strip_whitespace);
	return loaded;
}
// Logical connectives (name → symbol), loaded through load_table.
// Every row needs both cells: the rule-registration loop further down
// passes each row's `symbol` to RegExp.escape, so a row without a symbol
// cannot yield a usable pattern. OR therefore carries "∨", the
// counterpart of the "∧" used for AND.
const logic_ops = load_table(`
name symbol
---- ------
AND ∧
OR ∨
XOR ⊕
NAND ↑
NOR ↓
XNOR ⊙
IMPLIES →
IFF ↔
NOT ¬
`);
// Generic operator symbols (name → symbol), loaded through load_table.
// The rule-registration loop further down escapes each symbol with
// RegExp.escape and passes the row's name as the third argument of
// RegExp_Token_Parsing_Rule.
// Note: DOT is the raised dot "·", not the ASCII full stop (that one is
// PERIOD in the punctuation table); CROSS is the multiplication sign "×".
const generic_ops = load_table(`
name symbol
---- ------
PLUS +
HYPHEN -
DOT ·
ASTERISK *
CROSS ×
SLASH /
CARET ^
UNDERSCORE _
PERCENT %
`);
// Punctuation symbols (name → symbol), loaded through load_table.
// Registered by the same loop as the operator tables: each symbol is
// escaped with RegExp.escape before it is used as a pattern, so "." is
// matched literally rather than as a regex wildcard.
const punctuation = load_table(`
name symbol
---- ------
COMMA ,
SEMI_COLON ;
COLON :
PERIOD .
`);
// Bracket pairs (name, left symbol, right symbol), loaded through load_table.
// For every row the grouping loop further down registers a `LEFT_<name>`
// rule that enters a sub-tokenizer and a `RIGHT_<name>` rule that leaves it.
// NOTE(review): "PARENTESIS" looks like a misspelling of "PARENTHESIS", but
// the name is interpolated into the rule names and into the `name` field of
// emitted EXPR tokens, so renaming it changes runtime output — confirm
// before correcting.
const grouping = load_table(`
name left right
---- ---- -----
PARENTESIS ( )
SQUARE_BRACKET [ ]
CURLY_BRACE { }
ANGLE_BRACKET ⟨ ⟩
DOUBLE_ARROW_BRACKET « »
`);
// Greek alphabet (name, lower-case glyph, upper-case glyph), loaded through
// load_table. The Greek loop further down registers one `LOWER_<name>` and
// one `UPPER_<name>` rule per row.
// NOTE(review): several capitals (e.g. those for ALPHA, BETA, EPSILON) are
// visually indistinguishable from Latin capitals; they are presumably Greek
// code points — verify when editing this table by hand.
const greek_chars = load_table(`
name lower upper
---- ----- -----
ALPHA α Α
BETA β Β
GAMMA γ Γ
DELTA δ Δ
EPSILON ε Ε
ZETA ζ Ζ
ETA η Η
THETA θ Θ
IOTA ι Ι
KAPPA κ Κ
LAMBDA λ Λ
MU μ Μ
NU ν Ν
XI ξ Ξ
OMICRON ο Ο
PI π Π
RHO ρ Ρ
SIGMA σ Σ
TAU τ Τ
UPSILON υ Υ
PHI φ Φ
CHI χ Χ
PSI ψ Ψ
OMEGA ω Ω
`);
// Tokenizer that all rule registrations in this file are added to.
const rt = new RegExp_Tokenizer();

// Two rules per bracket pair: the left symbol enters a sub-tokenizer, the
// right symbol leaves it again.
for (const pair of grouping.iter_objects()) {
	const { name, left, right } = pair;

	const open_pattern = RegExp.escape(left);
	// On a left-symbol match, enter a sub-tokenizer. The callback given to
	// enter_sub_tokenizer pushes a single EXPR token that carries the pair
	// name, the value it receives, and both the opening and closing matches.
	// (The first argument is left undefined, as in the library call's other
	// use here — its meaning is defined by the tokenizer, not by this file.)
	const on_open = (tokenizer, ingress_match) => {
		const on_close = (inner_tokenizer, value, egress_match) =>
			inner_tokenizer.push_token({kind: 'EXPR', name, value, ingress_match, egress_match});
		return tokenizer.enter_sub_tokenizer(undefined, on_close);
	};
	rt.add_rules(new RegExp_Token_Parsing_Rule(open_pattern, on_open, `LEFT_${name}`));

	const close_pattern = RegExp.escape(right);
	// On a right-symbol match, leave the current sub-tokenizer, handing it
	// the closing match.
	const on_right = (tokenizer, match) => tokenizer.leave_sub_tokenizer(match);
	rt.add_rules(new RegExp_Token_Parsing_Rule(close_pattern, on_right, `RIGHT_${name}`));
}
// One literal-match rule per row of the operator and punctuation tables.
// Tables are processed in this order, and rows in table order.
for (const symbol_table of [logic_ops, generic_ops, punctuation]) {
	for (const row of symbol_table.iter_objects()) {
		const { name: rule_name, symbol: literal } = row;
		// Escape the symbol so that regex metacharacters match literally.
		const pattern = RegExp.escape(literal);
		// A match is pushed as a plain TOKEN carrying the match object.
		const emit_token = (tokenizer, match) => tokenizer.push_token({ kind: 'TOKEN', match });
		rt.add_rules(new RegExp_Token_Parsing_Rule(pattern, emit_token, rule_name));
	}
}
// Two rules per Greek letter: the lower-case glyph first, then the
// upper-case glyph, named `LOWER_<name>` and `UPPER_<name>`.
for (const letter of greek_chars.iter_objects()) {
	const { name, lower, upper } = letter;
	const variants = [
		[lower, `LOWER_${name}`],
		[upper, `UPPER_${name}`],
	];
	for (const [glyph, rule_name] of variants) {
		// A match is pushed as a plain TOKEN carrying the match object.
		const emit_token = (tokenizer, match) => tokenizer.push_token({ kind: 'TOKEN', match });
		rt.add_rules(new RegExp_Token_Parsing_Rule(RegExp.escape(glyph), emit_token, rule_name));
	}
}
// Runs of \w characters are pushed as plain TOKENs under the rule name WORD.
const word_rule = new RegExp_Token_Parsing_Rule(
	/\w+/,
	(tokenizer, match) => tokenizer.push_token({ kind: 'TOKEN', match }),
	'WORD',
);
rt.add_rules(word_rule);

// Whitespace gets a rule with a null handler — presumably so that it is
// matched without a token being pushed; confirm against the tokenizer.
const whitespace_rule = new RegExp_Token_Parsing_Rule(/\s+/, null, 'WHITESPACE');
rt.add_rules(whitespace_rule);

// Demo run: parse a sample sentence with nested parentheses and print the
// result.
const text = 'Hello World (how are you (doing)) I may ask';
const p = new Parser(text, { tokenizer: rt });
// Debug aid: console.log(rt.rules.at(-3));
const parse_result = p.parse();
console.log(parse_result);