215 lines
6.8 KiB
JavaScript
215 lines
6.8 KiB
JavaScript
import * as RE from '@efforting.tech/text/regexp';
|
|
import { Tokenization_Error } from '@efforting.tech/errors';
|
|
|
|
|
|
// NOTE: There are some open questions about this implementation, and its API may change as the library matures.
// See the example at experiments/regexp-tokenizer.mjs for more information on how to use it in its current state.
//
// Specifically, it has not yet been decided where the boundary between rule, action and capture should be.
|
|
|
|
|
|
// Resolves slice-style bounds against `text`.
//
// Negative positions count back from the end of the text and are clamped at 0;
// non-negative positions are passed through untouched (they are not clamped to
// the text length). A null/undefined end_position means "no upper bound" and is
// always reported as undefined.
//
// Returns a two-element array: [resolved_start, resolved_end].
function normalize_bounds(text, start_position, end_position) {
	const text_length = text.length;

	// Shared rule for both ends of the range.
	const resolve = (position) => (position < 0 ? Math.max(0, text_length + position) : position);

	// CLARIFICATION: loose equality ( == ) matches both null and undefined.
	if (end_position == undefined) {
		return [resolve(start_position), undefined];
	}

	return [resolve(start_position), resolve(end_position)];
}
|
|
|
|
|
|
// A match produced by one of the tokenizer's rules.
//
// `match` is the result array of executing the rule's pattern against the
// slice of `text` that starts at `start_position`, so `match.index` is relative
// to that slice. The absolute_* getters translate it back into indices of the
// full text.
export class Pattern_Match {
	constructor(text, start_position, end_position, match, rule) {
		// Resolve negative (from-the-end) positions before storing them.
		const [resolved_start, resolved_end] = normalize_bounds(text, start_position, end_position);

		this.text = text;
		this.start_position = resolved_start;
		this.end_position = resolved_end;
		this.match = match;
		this.rule = rule;
	}

	// Identifier of the rule that produced this match.
	get identifier() {
		const { rule } = this;
		return rule.identifier;
	}

	// The complete matched string.
	get value() {
		const { match } = this;
		return match[0];
	}

	// Capture groups only (everything after the complete match).
	get captured() {
		const { match } = this;
		return match.slice(1);
	}

	// Index in the full text of the first matched character.
	get absolute_start() {
		const { match, start_position } = this;
		return match.index + start_position;
	}

	// Index in the full text of the last matched character (inclusive).
	get absolute_end() {
		return this.absolute_start + this.match[0].length - 1;
	}

	// Index in the full text of the first character after the match.
	get pending_index() {
		return this.absolute_start + this.match[0].length;
	}
}
|
|
|
|
// A stretch of text that no rule matched.
//
// `match_start` / `match_end` are indices into the full text; `match_end` is
// inclusive (pending_index is match_end + 1) and may be undefined.
// `action` is called once, with this object, and its return value becomes
// `identifier`.
export class Default_Match {
	//TBD: Here we invoke action while creating this object, and assign the identifier but we don't do that on Pattern_Match - this feels a bit sketchy
	constructor(text, start_position, end_position, match_start, match_end, value, action) {
		// Normalize positions
		[start_position, end_position] = normalize_bounds(text, start_position, end_position);
		[match_start, match_end] = normalize_bounds(text, match_start, match_end);

		// Populate the instance BEFORE invoking the action, so the action can
		// inspect the match it is asked to identify (text, value, positions...).
		Object.assign(this, { text, start_position, end_position, match_start, match_end, value, action });

		this.identifier = action(this); //TODO: action protocol in accordance with issue #5
	}

	// Index of the first character after this match, or undefined when the
	// match has no known end.
	get pending_index() {
		// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/''
		if (this.match_end != undefined) {
			return this.match_end + 1;
		}
		return undefined;
	}
}
|
|
|
|
|
|
// Base class for rules backed by a regular expression.
//
// From the source and flags reported by RE.get_source / RE.get_flags, two
// RegExp objects are derived:
//   immediate_pattern - sticky ('y'): only matches exactly at lastIndex
//   scanning_pattern  - global ('g'), never sticky: finds the next match at or after lastIndex
// The original `pattern` is kept as given.
export class Abstract_RegExp_Token_Rule {

	constructor(pattern) {

		const pattern_source = RE.get_source(pattern);
		const pattern_flags = [...RE.get_flags(pattern)];

		// Set removes duplicates in case the pattern already carries the flag we add.
		const immediate_flags = [...new Set([...pattern_flags, 'y'])].join('');

		// A sticky flag inherited from the pattern would pin the scanning pattern
		// to lastIndex and prevent it from scanning ahead, so it is dropped here.
		const non_sticky_flags = pattern_flags.filter((flag) => flag !== 'y');
		const scanning_flags = [...new Set([...non_sticky_flags, 'g'])].join('');

		const immediate_pattern = new RegExp(pattern_source, immediate_flags);
		const scanning_pattern = new RegExp(pattern_source, scanning_flags);

		Object.assign(this, { pattern, immediate_pattern, scanning_pattern });
	}

}
|
|
|
|
// A regular-expression rule that carries an identifier for the tokens it produces.
//
// When no identifier is given (null or undefined), the rule object itself
// serves as the identifier.
export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
	constructor(pattern, identifier=undefined) {
		super(pattern);

		const has_identifier = identifier !== undefined && identifier !== null;
		this.identifier = has_identifier ? identifier : this;
	}
}
|
|
|
|
|
|
// Splits text into matches using an ordered list of rules.
//
// Each rule is expected to expose `immediate_pattern` and `scanning_pattern`
// (see Abstract_RegExp_Token_Rule). Text that no rule matches is reported as a
// Default_Match through `default_action`; without a default action such text
// raises a Tokenization_Error.
//
// start_position / end_position follow String.prototype.slice conventions:
// end_position is exclusive and undefined means "to the end of the text".
export class RegExp_Tokenizer {
	constructor(rules=[], default_action=undefined) {
		Object.assign(this, { rules, default_action });
	}

	// Installs a default action that always returns `identifier`.
	set_default_identifier(identifier) {
		this.default_action = (
			() => identifier
		);
	}

	// Appends rules; earlier rules take precedence over later ones.
	add_rules(...rules_to_add) {
		this.rules.push(...rules_to_add);
	}

	// Returns a Pattern_Match for the first rule whose immediate_pattern matches
	// exactly at start_position, or undefined when no rule does.
	immediate_match(text, start_position=0, end_position=undefined) {
		// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number
		const bounded = start_position !== 0 || end_position != undefined;
		const text_to_search = bounded ? text.slice(start_position, end_position) : text;

		for (const rule of this.rules) {
			const pattern = rule.immediate_pattern;
			// The pattern object is shared and stateful, always restart at the beginning of the slice.
			pattern.lastIndex = 0;
			const match = pattern.exec(text_to_search);
			if (match) {
				return new Pattern_Match(text, start_position, end_position, match, rule);
			}
		}
		return undefined;
	}

	// Wraps unmatched text in a Default_Match, or throws Tokenization_Error when
	// no default action is configured. match_start / match_end are indices into
	// the full text, match_end being inclusive.
	_handle_default_match(text, start_position, end_position, match_start, match_end, value) {
		const { default_action } = this;
		if (!default_action) {
			throw new Tokenization_Error({ parser: this, text, start_position, end_position, match_start, match_end, value });
		}
		return new Default_Match(text, start_position, end_position, match_start, match_end, value, default_action);
	}

	// Returns the next match at or after start_position:
	//   - a Pattern_Match when a rule matches right at start_position,
	//   - otherwise a Default_Match covering the unmatched text up to the closest
	//     rule match (or up to the end of the range when no rule matches at all),
	//   - undefined when there is nothing left in the range.
	closest_scanning_match(text, start_position=0, end_position=undefined) {

		const immediate_match = this.immediate_match(text, start_position, end_position);
		if (immediate_match) {
			return immediate_match;
		}

		// Pick the candidate that starts earliest; on ties the earlier rule wins.
		let best_candidate;
		for (const candidate of this.iter_scanning_rule_candidates(text, start_position, end_position)) {
			if ((best_candidate === undefined) || (best_candidate.absolute_start > candidate.absolute_start)) {
				best_candidate = candidate;
			}
		}

		// There was a match, check the head
		if (best_candidate) {
			const head = text.slice(start_position, best_candidate.absolute_start);
			if (head.length) {
				return this._handle_default_match(text, start_position, end_position, start_position, best_candidate.absolute_start - 1, head);
			}
			return best_candidate;
		}

		// There was no match, just get the tail. The tail must respect end_position,
		// otherwise text beyond the requested range would be reported.
		const tail = text.slice(start_position, end_position);
		if (tail.length) {
			// match_end is inclusive while end_position is exclusive, so compute the
			// index of the last character of the tail. When the range is open ended
			// match_end stays undefined, which iter_matches uses as its stop signal.
			const [tail_start] = normalize_bounds(text, start_position, undefined);
			const tail_end = end_position == undefined ? undefined : tail_start + tail.length - 1;
			return this._handle_default_match(text, start_position, end_position, start_position, tail_end, tail);
		}

		return undefined;
	}

	// Iterates over all rules and yields any matches found anywhere (but only once per rule)
	*iter_scanning_rule_candidates(text, start_position=0, end_position=undefined) {
		// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number
		const bounded = start_position !== 0 || end_position != undefined;
		const text_to_search = bounded ? text.slice(start_position, end_position) : text;

		for (const rule of this.rules) {
			const pattern = rule.scanning_pattern;
			// The pattern object is shared and stateful, always restart at the beginning of the slice.
			pattern.lastIndex = 0;
			const match = pattern.exec(text_to_search);

			if (match) {
				yield new Pattern_Match(text, start_position, end_position, match, rule);
			}
		}
	}

	// Yields consecutive matches (Pattern_Match or Default_Match) covering the
	// range from start_position to end_position.
	//
	// NOTE: a rule that matches the empty string at the current position produces
	// a pending_index equal to start_position, so this loop does not advance for
	// such rules.
	*iter_matches(text, start_position=0, end_position=undefined) {

		// Normalize positions
		[start_position, end_position] = normalize_bounds(text, start_position, end_position);

		while (true) {
			const pending = this.closest_scanning_match(text, start_position, end_position);
			if (pending) {
				yield pending;
			}
			// CLARIFICATION: loose equality ( == ) matches null and undefined but not false/0/''
			if (!pending || pending.pending_index == null || pending.pending_index === end_position ) {
				break;
			}
			start_position = pending.pending_index;
		}
	}

}
|
|
|