import * as RE from '@efforting.tech/text/regexp';
import { Tokenization_Error } from '@efforting.tech/errors';

// NOTE: There are some open questions about this implementation and API which may change as the library matures.
//       Check out the example at experiments/regexp-tokenizer.mjs for more information on how to use this in its current state.
//
//       Specifically it is not currently decided where the boundary between rule/action/capture should be

/**
 * Resolve possibly-negative positions against the length of `text`.
 *
 * Negative values count from the end of the text and are clamped at 0.
 * Non-negative values are returned unchanged (they are NOT clamped to the text length).
 * A nullish `end_position` is returned as `undefined`, meaning "no upper bound".
 *
 * @param {string} text
 * @param {number} start_position
 * @param {number} [end_position]
 * @returns {[number, number|undefined]} `[start, end]`
 */
function normalize_bounds(text, start_position, end_position) {
	const length = text.length;
	const resolve = (position) => (position < 0 ? Math.max(0, length + position) : position);

	const norm_start = resolve(start_position);
	// Loose comparison on purpose: treats both null and undefined as "unbounded".
	const norm_end = end_position == null ? undefined : resolve(end_position);
	return [norm_start, norm_end];
}

/**
 * A match produced by one of the tokenizer's rules.
 *
 * `match` is the `RegExp.prototype.exec` result obtained from the slice
 * `text.slice(start_position, end_position)`, so `match.index` is relative to
 * `start_position`; the `absolute_*` getters translate it back to indices in `text`.
 */
export class Pattern_Match {
	/**
	 * @param {string} text            The full text being tokenized.
	 * @param {number} start_position  Start of the searched window (negative values count from the end).
	 * @param {number} [end_position]  Exclusive end of the searched window, or nullish for none.
	 * @param {RegExpExecArray} match  Result of executing the rule's pattern on the window.
	 * @param {object} rule            The rule that produced the match.
	 */
	constructor(text, start_position, end_position, match, rule) {
		// Normalize positions
		[start_position, end_position] = normalize_bounds(text, start_position, end_position);
		Object.assign(this, { text, start_position, end_position, match, rule });
	}

	/** Identifier of the rule that matched. */
	get identifier() {
		return this.rule.identifier;
	}

	/** The full matched string. */
	get value() {
		return this.match[0];
	}

	/** The capture groups of the match (everything after the full match). */
	get captured() {
		return this.match.slice(1);
	}

	/** Index in `text` of the first matched character. */
	get absolute_start() {
		return this.match.index + this.start_position;
	}

	/** Index in `text` of the last matched character (inclusive). */
	get absolute_end() {
		return this.absolute_start + this.match[0].length - 1;
	}

	/** Index in `text` immediately after the match, i.e. where tokenization should resume. */
	get pending_index() {
		return this.absolute_start + this.match[0].length;
	}
}

/**
 * A stretch of text that no rule matched, labelled by the tokenizer's default action.
 *
 * `match_start` and `match_end` are inclusive indices into `text`;
 * `match_end` may be `undefined` when the stretch runs to the end of an unbounded search.
 */
export class Default_Match {
	//TBD: Here we invoke action while creating this object, and assign the identifier but we don't do that on Pattern_Match - this feels a bit sketchy

	/**
	 * @param {string} text
	 * @param {number} start_position  Start of the searched window.
	 * @param {number} [end_position]  Exclusive end of the searched window, or nullish for none.
	 * @param {number} match_start     Index of the first unmatched character.
	 * @param {number} [match_end]     Index of the last unmatched character (inclusive), or nullish.
	 * @param {string} value           The unmatched text.
	 * @param {function(Default_Match): *} action  Called with this object; its return value becomes `identifier`.
	 */
	constructor(text, start_position, end_position, match_start, match_end, value, action) {
		// Normalize positions
		[start_position, end_position] = normalize_bounds(text, start_position, end_position);
		[match_start, match_end] = normalize_bounds(text, match_start, match_end);

		// Populate the fields BEFORE invoking the action so that the action can
		// actually inspect the match it is asked to identify.
		Object.assign(this, { text, start_position, end_position, match_start, match_end, value, action });
		this.identifier = action(this); //TODO: action protocol in accordance with issue #5
	}

	/**
	 * Index in `text` immediately after this match, or `undefined` when
	 * `match_end` is unknown (the match consumed the rest of an unbounded search).
	 */
	get pending_index() {
		// Loose comparison on purpose: matches both null and undefined, but not 0.
		if (this.match_end != null) {
			return this.match_end + 1;
		}
		return undefined;
	}
}

/**
 * Base class for rules backed by a regular expression.
 *
 * Two derived patterns are compiled from the given one:
 *  - `immediate_pattern`: original flags plus `y` (sticky), used to match exactly at the current position.
 *  - `scanning_pattern`:  original flags plus `g` (global), used to search forward for the next match.
 *
 * NOTE(review): if the supplied pattern already carries the `y` flag, `scanning_pattern`
 * inherits it and will only match at the start of the searched slice - confirm this is intended.
 */
export class Abstract_RegExp_Token_Rule {
	/**
	 * @param {*} pattern  Anything `RE.get_source` / `RE.get_flags` accept.
	 */
	constructor(pattern) {
		const pattern_source = RE.get_source(pattern);
		const pattern_flags = RE.get_flags(pattern);

		// De-duplicate through a Set so an already present flag is not repeated
		// (a duplicated flag makes the RegExp constructor throw).
		const with_flag = (flag) => [...new Set([...pattern_flags, flag])].join('');

		const immediate_pattern = new RegExp(pattern_source, with_flag('y'));
		const scanning_pattern = new RegExp(pattern_source, with_flag('g'));

		Object.assign(this, { pattern, immediate_pattern, scanning_pattern });
	}
}

/**
 * A regular expression rule with an identifier.
 * When no identifier is given the rule itself serves as its identifier.
 */
export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
	/**
	 * @param {*} pattern
	 * @param {*} [identifier]  Defaults to the rule instance when nullish.
	 */
	constructor(pattern, identifier=undefined) {
		super(pattern);
		Object.assign(this, { identifier: identifier ?? this });
	}
}

/**
 * Splits text into matches using an ordered list of rules.
 * Text not covered by any rule is handed to `default_action`;
 * without a default action such text raises a `Tokenization_Error`.
 */
export class RegExp_Tokenizer {
	/**
	 * @param {Array} rules  Rules exposing `immediate_pattern`, `scanning_pattern` and `identifier`. Earlier rules win ties.
	 * @param {function(Default_Match): *} [default_action]  Produces the identifier for unmatched text.
	 */
	constructor(rules=[], default_action=undefined) {
		Object.assign(this, { rules, default_action });
	}

	/** Install a default action that labels all unmatched text with `identifier`. */
	set_default_identifier(identifier) {
		this.default_action = () => identifier;
	}

	/** Append rules; they are tried after the rules already present. */
	add_rules(...rules_to_add) {
		this.rules.push(...rules_to_add);
	}

	/**
	 * Try every rule exactly at `start_position` and return the first hit.
	 *
	 * @param {string} text
	 * @param {number} [start_position=0]
	 * @param {number} [end_position]  Exclusive upper bound of the search.
	 * @returns {Pattern_Match|undefined}
	 */
	immediate_match(text, start_position=0, end_position=undefined) {
		// Only slice when the caller actually restricted the window.
		// end_position is compared loosely so that null counts as "no bound" too.
		const bounded = start_position !== 0 || end_position != null;
		const text_to_search = bounded ? text.slice(start_position, end_position) : text;

		for (const rule of this.rules) {
			const pattern = rule.immediate_pattern;
			// The pattern object is shared and stateful (sticky), so rewind it before every use.
			pattern.lastIndex = 0;
			const match = pattern.exec(text_to_search);
			if (match) {
				return new Pattern_Match(text, start_position, end_position, match, rule);
			}
		}
		return undefined;
	}

	/**
	 * Build a `Default_Match` for unmatched text.
	 *
	 * @throws {Tokenization_Error} When no default action is configured.
	 */
	_handle_default_match(text, start_position, end_position, match_start, match_end, value) {
		const { default_action } = this;
		if (!default_action) {
			throw new Tokenization_Error({ parser: this, text, start_position, end_position, match_start, match_end, value });
		}
		return new Default_Match(text, start_position, end_position, match_start, match_end, value, default_action);
	}

	/**
	 * Return the next match at or after `start_position`.
	 *
	 * Order of preference:
	 *  1. a rule matching exactly at `start_position`;
	 *  2. otherwise, the unmatched text in front of the earliest rule match found
	 *     by scanning (as a default match);
	 *  3. otherwise, if no rule matches anywhere in the window, the whole remaining
	 *     window (as a default match).
	 *
	 * @param {string} text
	 * @param {number} [start_position=0]
	 * @param {number} [end_position]  Exclusive upper bound of the search.
	 * @returns {Pattern_Match|Default_Match|undefined} `undefined` when the window is exhausted.
	 * @throws {Tokenization_Error} When unmatched text is found and no default action is configured.
	 */
	closest_scanning_match(text, start_position=0, end_position=undefined) {
		// Normalizing here (it is idempotent) lets the index arithmetic below
		// assume non-negative positions even when this method is called directly.
		[start_position, end_position] = normalize_bounds(text, start_position, end_position);

		const immediate_match = this.immediate_match(text, start_position, end_position);
		if (immediate_match) {
			return immediate_match;
		}

		// Pick the earliest candidate; the strict comparison keeps the first rule on ties.
		let best_candidate;
		for (const candidate of this.iter_scanning_rule_candidates(text, start_position, end_position)) {
			if (best_candidate === undefined || candidate.absolute_start < best_candidate.absolute_start) {
				best_candidate = candidate;
			}
		}

		// There was no match, just get the tail
		if (!best_candidate) {
			// Respect end_position: the tail must not reach beyond the searched window.
			const tail = text.slice(start_position, end_position);
			if (!tail.length) {
				return undefined;
			}
			// match_end is inclusive while end_position is exclusive. An unbounded
			// search keeps match_end undefined so that pending_index is undefined too.
			const match_end = end_position === undefined ? undefined : Math.min(end_position, text.length) - 1;
			return this._handle_default_match(text, start_position, end_position, start_position, match_end, tail);
		}

		// There was a match, check the head
		const head = text.slice(start_position, best_candidate.absolute_start);
		if (head.length) {
			return this._handle_default_match(text, start_position, end_position, start_position, best_candidate.absolute_start - 1, head);
		}

		return best_candidate;
	}

	/**
	 * Iterates over all rules and yields any matches found anywhere in the window
	 * (but only once per rule: the first occurrence).
	 *
	 * @param {string} text
	 * @param {number} [start_position=0]
	 * @param {number} [end_position]  Exclusive upper bound of the search.
	 * @yields {Pattern_Match}
	 */
	*iter_scanning_rule_candidates(text, start_position=0, end_position=undefined) {
		// Only slice when the caller actually restricted the window.
		// end_position is compared loosely so that null counts as "no bound" too.
		const bounded = start_position !== 0 || end_position != null;
		const text_to_search = bounded ? text.slice(start_position, end_position) : text;

		for (const rule of this.rules) {
			const pattern = rule.scanning_pattern;
			// The pattern object is shared and stateful (global), so rewind it before every use.
			pattern.lastIndex = 0;
			const match = pattern.exec(text_to_search);
			if (match) {
				yield new Pattern_Match(text, start_position, end_position, match, rule);
			}
		}
	}

	/**
	 * Tokenize `text`, yielding consecutive matches until the window is exhausted.
	 *
	 * NOTE(review): a rule that matches the empty string yields a match whose
	 * pending_index equals the current position, so this generator would keep
	 * yielding it forever - rules are expected to consume at least one character.
	 *
	 * @param {string} text
	 * @param {number} [start_position=0]
	 * @param {number} [end_position]  Exclusive upper bound of the search.
	 * @yields {Pattern_Match|Default_Match}
	 * @throws {Tokenization_Error} When unmatched text is found and no default action is configured.
	 */
	*iter_matches(text, start_position=0, end_position=undefined) {
		// Normalize positions
		[start_position, end_position] = normalize_bounds(text, start_position, end_position);

		while (true) {
			const pending = this.closest_scanning_match(text, start_position, end_position);
			if (!pending) {
				break;
			}
			yield pending;

			const next_position = pending.pending_index;
			// Stop when the match has no known end (loose comparison covers null
			// and undefined) or when it ends exactly at the window's end.
			if (next_position == null || next_position === end_position) {
				break;
			}
			start_position = next_position;
		}
	}
}