Added dev target in makefile, added regexp-dispatch and example
This commit is contained in:
180
source/parsing/regexp-dispatch.mjs
Normal file
180
source/parsing/regexp-dispatch.mjs
Normal file
@@ -0,0 +1,180 @@
|
||||
import * as RE from '@efforting.tech/text/regexp';
|
||||
|
||||
// NOTE: There are some open questions about this implementation and API which may change as the library matures.
|
||||
// Check out the example at experiments/regexp-tokenizer.mjs for more information on how to use this in its current state.
|
||||
//
|
||||
// Specifically it is not currently decided where the boundary between rule/action/capture should be
|
||||
|
||||
// Wraps a successful RegExp exec() result together with the rule that produced it.
export class Pattern_Match {
	constructor(match, rule) {
		this.match = match;
		this.rule = rule;
	}

	// The identifier is delegated to the originating rule.
	get identifier() {
		return this.rule.identifier;
	}

	// The full matched text (group 0 of the exec result).
	get value() {
		return this.match[0];
	}

	// All explicit capture groups, excluding the full match.
	get captured() {
		return this.match.slice(1);
	}

	// Position in the scanned text immediately after this match.
	get pending_index() {
		return this.match.index + this.match[0].length;
	}
}
|
||||
|
||||
// Represents a stretch of text that no rule matched. The tokenizer's
// default_action classifies it: the action is called with this instance and
// its return value becomes the match's identifier.
export class Default_Match {
	/**
	 * @param {string} text - The unmatched text span.
	 * @param {number} index - Start index of the span in the scanned text.
	 * @param {?number} end_index - End index of the span, or null when it runs to the end of the input.
	 * @param {Function} action - Classifier; receives this instance, returns the identifier.
	 */
	constructor(text, index, end_index, action) {
		// BUGFIX: assign the positional fields BEFORE invoking the action, so
		// the action can read properties such as `value`, `index` and
		// `end_index`. Previously the action was invoked on an instance with
		// no fields set, so any state-reading action saw `undefined`.
		Object.assign(this, { text, index, end_index, action });
		this.identifier = action(this);
	}

	// The unmatched text itself.
	get value() {
		return this.text;
	}

	// Index to continue scanning from; null signals the input is exhausted.
	get pending_index() {
		if (this.end_index === null) {
			return null;
		} else {
			return this.end_index;
		}
	}
}
|
||||
|
||||
|
||||
// Base class that compiles a pattern into two variants: a sticky ('y') one
// for matching at an exact position and a global ('g') one for scanning
// ahead. The originals' flags are preserved; the anchoring flag is added
// exactly once via a Set.
export class Abstract_RegExp_Token_Rule {
	constructor(pattern) {
		const source = RE.get_source(pattern);
		const flags = RE.get_flags(pattern);

		const sticky_flags = [...new Set([...flags, 'y'])].join('');
		const global_flags = [...new Set([...flags, 'g'])].join('');

		this.pattern = pattern;
		this.immediate_pattern = new RegExp(source, sticky_flags);
		this.scanning_pattern = new RegExp(source, global_flags);
	}
}
|
||||
|
||||
// A concrete token rule: a pattern plus an identifier reported on its matches.
export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
	/**
	 * @param {RegExp|string} pattern - Pattern handed to the abstract base.
	 * @param {*} [identifier] - Token identifier; defaults to the rule instance itself.
	 */
	constructor(pattern, identifier=undefined) {
		super(pattern);
		// BUGFIX: the previous signature used `identifier=this` as a default
		// parameter. Defaults are evaluated before the constructor body runs,
		// i.e. before super(), where `this` is still uninitialized in a
		// derived class — so omitting `identifier` threw a ReferenceError.
		// Resolve the default here, after super() has initialized `this`.
		Object.assign(this, { identifier: identifier === undefined ? this : identifier });
	}
}
|
||||
|
||||
// Dispatches over a list of token rules, matching text either exactly at a
// position ("immediate", sticky patterns) or anywhere ahead of it
// ("scanning", global patterns). Spans no rule matches are delegated to
// `default_action` (see _handle_default_match).
export class RegExp_Tokenizer {
	// rules: array of rule objects exposing `immediate_pattern` (sticky) and
	// `scanning_pattern` (global); default_action: optional classifier called
	// for text that no rule matches.
	constructor(rules=[], default_action=undefined) {
		Object.assign(this, { rules, default_action });
	}

	// Convenience: make every unmatched span carry a fixed identifier.
	set_default_identifier(identifier) {
		this.default_action = (
			() => identifier
		);
	}

	// Appends rules; earlier rules win ties in immediate_match.
	add_rules(...rules_to_add) {
		this.rules.push(...rules_to_add);
	}

	// Tries each rule anchored exactly at `position` (sticky pattern).
	// Returns a Pattern_Match for the first rule that matches, else undefined.
	immediate_match(text, position=0) {
		for (const rule of this.rules) {
			const pattern = rule.immediate_pattern;
			// Sticky/global patterns are stateful; set lastIndex before exec.
			pattern.lastIndex = position;
			const match = pattern.exec(text);
			if (match) {
				return new Pattern_Match(match, rule);
			}
		}
	}

	// Wraps an unmatched span in a Default_Match, or throws when no
	// default_action is configured.
	// NOTE(review): Parsing_Error is neither defined nor imported in this
	// file, so reaching this throw would raise a ReferenceError instead of
	// the intended error — confirm the missing import.
	_handle_default_match(value, index, end_index=null) {
		const { default_action } = this;
		if (!default_action) {
			throw new Parsing_Error({ parser: this, value, index, end_index });
		}
		return new Default_Match(value, index, end_index, default_action);
	}

	// Finds the next match at or after `position`. Priority order:
	//   1. a rule matching exactly at `position` (immediate);
	//   2. the unmatched head before the earliest scanning match (as default);
	//   3. the earliest scanning match itself;
	//   4. the unmatched tail (as default) when no rule matched at all.
	// Returns undefined when `position` is already at the end of the text.
	closest_scanning_match(text, position=0) {
		const immediate_match = this.immediate_match(text, position);
		if (immediate_match) {
			return immediate_match;
		}

		// Pick the candidate whose match starts closest to `position`.
		let best_candidate;
		for (const candidate of this.iter_scanning_rule_candidates(text, position)) {
			if ((best_candidate === undefined) || (best_candidate.match.index > candidate.match.index)) {
				best_candidate = candidate;
			}
		}

		// There was no match, just get the tail
		if (!best_candidate) {
			const tail = text.slice(position);
			if (tail.length) {
				return this._handle_default_match(tail, position);
			}
		}

		// There was a match, check the head
		if (best_candidate) {
			const head = text.slice(position, best_candidate.match.index);
			if (head.length) {
				return this._handle_default_match(head, position, best_candidate.match.index);
			}
		}

		return best_candidate;
	}

	// Iterates over all rules and yields any matches found anywhere (but only once per rule)
	*iter_scanning_rule_candidates(text, position=0) {
		for (const rule of this.rules) {
			const pattern = rule.scanning_pattern;
			// Global patterns are stateful; set lastIndex before exec.
			pattern.lastIndex = position;
			const match = pattern.exec(text);

			if (match) {
				yield new Pattern_Match(match, rule);
			}
		}
	}

	// Yields successive matches until the input is exhausted
	// (pending_index === null) or nothing more matches.
	// NOTE(review): a rule that matches the empty string would never advance
	// `position`, making this loop yield the same zero-width match forever —
	// confirm patterns are guaranteed non-empty.
	*iter_matches(text, position=0) {
		while (true) {
			const pending = this.closest_scanning_match(text, position);
			if (pending) {
				yield pending;
			}
			if (!pending || pending.pending_index === null) {
				break;
			}
			position = pending.pending_index;
		}
	}
}
|
||||
|
||||
Reference in New Issue
Block a user