Worked on #3: I believe the regexp part is now complete, but it still needs testing

This commit is contained in:
2026-05-25 02:29:57 +02:00
parent 1842d3de9c
commit d584a49579
3 changed files with 84 additions and 44 deletions

View File

@@ -12,4 +12,15 @@ rt.set_default_identifier('random stuff');
for (const m of rt.iter_matches('#Hello World!')) {
console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured });
};
};
console.log('--=| Slicing |=--')
for (const m of rt.iter_matches('#Hello World!', 3, -3)) {
//console.log(m, m.pending_index)
console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured });
};

View File

@@ -6,7 +6,7 @@ import { inspect } from 'node:util';
export class Tokenization_Error extends Error {
constructor(data) {
const { parser, value, index, end_index } = data;
const {parser, text, start_position, end_position, match_start, match_end, value} = data;
super(`Tokenization_Error`); //TODO: Format message
this.data = data;
}

View File

@@ -7,13 +7,24 @@ import { Tokenization_Error } from '@efforting.tech/errors';
//
// Specifically it is not currently decided where the boundary between rule/action/capture should be
/**
 * Resolve slice-style bounds against the length of `text`.
 *
 * A negative position is counted back from the end of `text` and floored at 0.
 * A non-negative position is passed through untouched (it is NOT capped at the
 * text length). A nullish `end_position` (null or undefined) is reported as
 * `undefined`, meaning "no explicit end".
 *
 * @param {string} text - The text whose length anchors negative positions.
 * @param {number} start_position - Start bound, possibly negative.
 * @param {number|null|undefined} end_position - End bound, possibly negative or absent.
 * @returns {[number, number|undefined]} The pair `[start, end]` after resolution.
 */
function normalize_bounds(text, start_position, end_position) {
	const len = text.length;

	// Shared rule for both bounds: only negative values are rewritten.
	const resolve = (position) => {
		if (position < 0) {
			return Math.max(0, len + position);
		}
		return position;
	};

	const resolved_start = resolve(start_position);

	// Loose comparison on purpose: null and undefined both mean "no end bound".
	if (end_position == undefined) {
		return [resolved_start, undefined];
	}

	return [resolved_start, resolve(end_position)];
}
export class Pattern_Match {
constructor(match, rule) {
Object.assign(this, { match, rule });
constructor(text, start_position, end_position, match, rule) {
// Normalize positions
[start_position, end_position] = normalize_bounds(text, start_position, end_position);
Object.assign(this, { text, start_position, end_position, match, rule });
}
get identifier() {
return this.rule. identifier;
return this.rule.identifier;
}
get value() {
@@ -24,28 +35,34 @@ export class Pattern_Match {
return this.match.slice(1);
}
get pending_index() {
return this.match.index + this.match[0].length;
get absolute_start() {
return this.match.index + this.start_position;
}
get absolute_end() {
return this.match.index + this.start_position + this.match[0].length - 1;
}
get pending_index() {
return this.match.index + this.start_position + this.match[0].length;
}
}
export class Default_Match {
constructor(text, index, end_index, action) {
const identifier = action(this);
Object.assign(this, { text, index, end_index, action, identifier });
}
//TBD: Here we invoke action while creating this object, and assign the identifier but we don't do that on Pattern_Match - this feels a bit sketchy
constructor(text, start_position, end_position, match_start, match_end, value, action) {
// Normalize positions
[start_position, end_position] = normalize_bounds(text, start_position, end_position);
[match_start, match_end] = normalize_bounds(text, match_start, match_end);
get value() {
return this.text;
const identifier = action(this); //TODO: action protocol in accordance with issue #5
Object.assign(this, { text, start_position, end_position, match_start, match_end, value, action, identifier });
}
get pending_index() {
if (this.end_index === null) {
return null;
} else {
return this.end_index;
// CLARIFICATION: loose inequality ( != ) matches both null and undefined but not false/0/''; when match_end is nullish this getter falls through and yields undefined
if (this.match_end != undefined) {
return this.match_end + 1;
}
}
}
@@ -76,8 +93,6 @@ export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
}
}
// Note: There is no clean built in way to set an end position of a RegExp pattern, the only generic way is to slice the string we match before.
// We may at some point implement support for this (and it would only be done if end position was given)
export class RegExp_Tokenizer {
constructor(rules=[], default_action=undefined) {
@@ -94,53 +109,58 @@ export class RegExp_Tokenizer {
this.rules.push(...rules_to_add);
}
immediate_match(text, position=0) {
immediate_match(text, start_position=0, end_position=undefined) {
// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number
const bounded = start_position !== 0 || end_position != undefined;
const text_to_search = bounded ? text.slice(start_position, end_position) : text;
for (const rule of this.rules) {
const pattern = rule.immediate_pattern;
pattern.lastIndex = position;
const match = pattern.exec(text);
pattern.lastIndex = 0;
const match = pattern.exec(text_to_search);
if (match) {
return new Pattern_Match(match, rule);
return new Pattern_Match(text, start_position, end_position, match, rule);
}
}
}
_handle_default_match(value, index, end_index=null) {
_handle_default_match(text, start_position, end_position, match_start, match_end, value) {
const { default_action } = this;
if (!default_action) {
throw new Tokenization_Error({ parser: this, value, index, end_index });
throw new Tokenization_Error({ parser: this, text, start_position, end_position, match_start, match_end, value });
}
return new Default_Match(value, index, end_index, default_action);
return new Default_Match(text, start_position, end_position, match_start, match_end, value, default_action);
}
closest_scanning_match(text, position=0) {
const immediate_match = this.immediate_match(text, position);
closest_scanning_match(text, start_position=0, end_position=undefined) {
const immediate_match = this.immediate_match(text, start_position, end_position);
if (immediate_match) {
return immediate_match;
}
let best_candidate;
for (const candidate of this.iter_scanning_rule_candidates(text, position)) {
if ((best_candidate === undefined) || (best_candidate.match.index > candidate.match.index)) {
for (const candidate of this.iter_scanning_rule_candidates(text, start_position, end_position)) {
if ((best_candidate === undefined) || (best_candidate.absolute_start > candidate.absolute_start)) {
best_candidate = candidate;
}
}
// There was no match, just get the tail
if (!best_candidate) {
const tail = text.slice(position);
const tail = text.slice(start_position);
if (tail.length) {
return this._handle_default_match(tail, position);
return this._handle_default_match(text, start_position, end_position, start_position, end_position, tail);
}
}
// There was a match, check the head
if (best_candidate) {
const head = text.slice(position, best_candidate.match.index);
const head = text.slice(start_position, best_candidate.absolute_start);
if (head.length) {
return this._handle_default_match(head, position, best_candidate.match.index);
return this._handle_default_match(text, start_position, end_position, start_position, best_candidate.absolute_start - 1, head);
}
}
@@ -149,32 +169,41 @@ export class RegExp_Tokenizer {
}
*iter_scanning_rule_candidates(text, position=0) {
// Iterates over all rules and yields any matches found anywhere (but only once per rule)
*iter_scanning_rule_candidates(text, start_position=0, end_position=undefined) {
// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number
const bounded = start_position !== 0 || end_position != undefined;
const text_to_search = bounded ? text.slice(start_position, end_position) : text;
// Iterates over all rules and yields any matches found anywhere (but only once per rule)
for (const rule of this.rules) {
const pattern = rule.scanning_pattern;
pattern.lastIndex = position;
const match = pattern.exec(text);
pattern.lastIndex = 0;
const match = pattern.exec(text_to_search);
if (match) {
yield new Pattern_Match(match, rule);
yield new Pattern_Match(text, start_position, end_position, match, rule);
}
}
}
*iter_matches(text, position=0) {
*iter_matches(text, start_position=0, end_position=undefined) {
// Normalize positions
[start_position, end_position] = normalize_bounds(text, start_position, end_position);
while (true) {
const pending = this.closest_scanning_match(text, position);
const pending = this.closest_scanning_match(text, start_position, end_position);
if (pending) {
yield pending;
}
if (!pending || pending.pending_index === null) {
// CLARIFICATION: loose equality ( == ) matches both null and undefined but not false/0/''; the comparison against end_position is strict so that an undefined end_position never equals a numeric pending_index
if (!pending || pending.pending_index == null || pending.pending_index === end_position ) {
break;
}
position = pending.pending_index;
start_position = pending.pending_index;
}
}