diff --git a/experiments/regexp-tokenizer.mjs b/experiments/regexp-tokenizer.mjs index eb34226..5d27624 100644 --- a/experiments/regexp-tokenizer.mjs +++ b/experiments/regexp-tokenizer.mjs @@ -12,4 +12,15 @@ rt.set_default_identifier('random stuff'); for (const m of rt.iter_matches('#Hello World!')) { console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured }); -}; \ No newline at end of file +}; + + +console.log('--=| Slicing |=--') + +for (const m of rt.iter_matches('#Hello World!', 3, -3)) { + //console.log(m, m.pending_index) + console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured }); + + +}; + diff --git a/source/errors.mjs b/source/errors.mjs index bdc1f72..4d8975b 100644 --- a/source/errors.mjs +++ b/source/errors.mjs @@ -6,7 +6,7 @@ import { inspect } from 'node:util'; export class Tokenization_Error extends Error { constructor(data) { - const { parser, value, index, end_index } = data; + const {parser, text, start_position, end_position, match_start, match_end, value} = data; super(`Tokenization_Error`); //TODO: Format message this.data = data; } diff --git a/source/parsing/regexp-dispatch.mjs b/source/parsing/regexp-dispatch.mjs index ceac4bd..e7bc0a3 100644 --- a/source/parsing/regexp-dispatch.mjs +++ b/source/parsing/regexp-dispatch.mjs @@ -7,13 +7,24 @@ import { Tokenization_Error } from '@efforting.tech/errors'; // // Specifically it is not currently decided where the boundary between rule/action/capture should be + +function normalize_bounds(text, start_position, end_position) { + const len = text.length; + const norm_start = start_position < 0 ? Math.max(0, len + start_position) : start_position; + const norm_end = end_position == undefined ? undefined : (end_position < 0 ? Math.max(0, len + end_position) : end_position); + return [norm_start, norm_end]; +} + + export class Pattern_Match { - constructor(match, rule) { - Object.assign(this, { match, rule }); + constructor(text, start_position, end_position, match, rule) { + // Normalize positions + [start_position, end_position] = normalize_bounds(text, start_position, end_position); + Object.assign(this, { text, start_position, end_position, match, rule }); } get identifier() { - return this.rule. identifier; + return this.rule.identifier; } get value() { @@ -24,28 +35,34 @@ export class Pattern_Match { return this.match.slice(1); } - get pending_index() { - return this.match.index + this.match[0].length; + get absolute_start() { + return this.match.index + this.start_position; } + get absolute_end() { + return this.match.index + this.start_position + this.match[0].length - 1; + } + + get pending_index() { + return this.match.index + this.start_position + this.match[0].length; + } } export class Default_Match { - constructor(text, index, end_index, action) { - const identifier = action(this); - Object.assign(this, { text, index, end_index, action, identifier }); - } + //TBD: Here we invoke action while creating this object, and assign the identifier but we don't do that on Pattern_Match - this feels a bit sketchy + constructor(text, start_position, end_position, match_start, match_end, value, action) { + // Normalize positions + [start_position, end_position] = normalize_bounds(text, start_position, end_position); + [match_start, match_end] = normalize_bounds(text, match_start, match_end); - get value() { - return this.text; + const identifier = action(this); //TODO: action protocol in accordance with issue #5 + Object.assign(this, { text, start_position, end_position, match_start, match_end, value, action, identifier }); } - get pending_index() { - if (this.end_index === null) { - return null; - } else { - return this.end_index; + // CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number + if (this.match_end != undefined) { + return this.match_end + 1; } } } @@ -76,8 +93,6 @@ export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule { } } -// Note: There is no clean built in way to set an end position of a RegExp pattern, the only generic way is to slice the string we match before. -// We may at some point implement support for this (and it would only be done if end position was given) export class RegExp_Tokenizer { constructor(rules=[], default_action=undefined) { @@ -94,53 +109,58 @@ export class RegExp_Tokenizer { this.rules.push(...rules_to_add); } - immediate_match(text, position=0) { + immediate_match(text, start_position=0, end_position=undefined) { + // CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number + const bounded = start_position !== 0 || end_position != undefined; + const text_to_search = bounded ? text.slice(start_position, end_position) : text; + for (const rule of this.rules) { const pattern = rule.immediate_pattern; - pattern.lastIndex = position; - const match = pattern.exec(text); + pattern.lastIndex = 0; + const match = pattern.exec(text_to_search); if (match) { - return new Pattern_Match(match, rule); + return new Pattern_Match(text, start_position, end_position, match, rule); } } } - _handle_default_match(value, index, end_index=null) { + _handle_default_match(text, start_position, end_position, match_start, match_end, value) { const { default_action } = this; if (!default_action) { - throw new Tokenization_Error({ parser: this, value, index, end_index }); + throw new Tokenization_Error({ parser: this, text, start_position, end_position, match_start, match_end, value }); } - return new Default_Match(value, index, end_index, default_action); + return new Default_Match(text, start_position, end_position, match_start, match_end, value, default_action); } - closest_scanning_match(text, position=0) { - const immediate_match = this.immediate_match(text, position); + closest_scanning_match(text, start_position=0, end_position=undefined) { + + const immediate_match = this.immediate_match(text, start_position, end_position); if (immediate_match) { return immediate_match; } let best_candidate; - for (const candidate of this.iter_scanning_rule_candidates(text, position)) { - if ((best_candidate === undefined) || (best_candidate.match.index > candidate.match.index)) { + for (const candidate of this.iter_scanning_rule_candidates(text, start_position, end_position)) { + if ((best_candidate === undefined) || (best_candidate.absolute_start > candidate.absolute_start)) { best_candidate = candidate; } } // There was no match, just get the tail if (!best_candidate) { - const tail = text.slice(position); + const tail = text.slice(start_position); if (tail.length) { - return this._handle_default_match(tail, position); + return this._handle_default_match(text, start_position, end_position, start_position, end_position, tail); } } // There was a match, check the head if (best_candidate) { - const head = text.slice(position, best_candidate.match.index); + const head = text.slice(start_position, best_candidate.absolute_start); if (head.length) { - return this._handle_default_match(head, position, best_candidate.match.index); + return this._handle_default_match(text, start_position, end_position, start_position, best_candidate.absolute_start - 1, head); } } @@ -149,32 +169,41 @@ export class RegExp_Tokenizer { } - *iter_scanning_rule_candidates(text, position=0) { - // Iterates over all rules and yields any matches found anywhere (but only once per rule) + *iter_scanning_rule_candidates(text, start_position=0, end_position=undefined) { + // CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number + const bounded = start_position !== 0 || end_position != undefined; + const text_to_search = bounded ? text.slice(start_position, end_position) : text; + + // Iterates over all rules and yields any matches found anywhere (but only once per rule) for (const rule of this.rules) { const pattern = rule.scanning_pattern; - pattern.lastIndex = position; - const match = pattern.exec(text); + pattern.lastIndex = 0; + const match = pattern.exec(text_to_search); if (match) { - yield new Pattern_Match(match, rule); + yield new Pattern_Match(text, start_position, end_position, match, rule); } } } - *iter_matches(text, position=0) { + *iter_matches(text, start_position=0, end_position=undefined) { + + // Normalize positions + [start_position, end_position] = normalize_bounds(text, start_position, end_position); + while (true) { - const pending = this.closest_scanning_match(text, position); + const pending = this.closest_scanning_match(text, start_position, end_position); if (pending) { yield pending; } - if (!pending || pending.pending_index === null) { + // CLARIFICATION: loose equality ( == ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number + if (!pending || pending.pending_index == null || pending.pending_index === end_position ) { break; } - position = pending.pending_index; + start_position = pending.pending_index; } }