Worked on #3 where I believe the regexp part is now complete but it needs testing

Added math sub system planning document written by Claude
2026-05-25 02:29:57 +02:00 · 2026-05-25 02:28:55 +02:00
4 changed files with 163 additions and 44 deletions
--- a/experiments/regexp-tokenizer.mjs
+++ b/experiments/regexp-tokenizer.mjs
@@ -12,4 +12,15 @@ rt.set_default_identifier('random stuff');

 for (const m of rt.iter_matches('#Hello World!')) {
 	console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured });
-};
+};
+
+
+console.log('--=| Slicing |=--')
+
+for (const m of rt.iter_matches('#Hello World!', 3, -3)) {
+	//console.log(m, m.pending_index)
+	console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured });
+
+
+};
+
--- a/planning/math-subsystem.md
+++ b/planning/math-subsystem.md
@@ -0,0 +1,79 @@
+# Mathematical Expression Subsystem
+
+> [!NOTE]
+> This document was written by Claude (Anthropic, Sonnet 4.6) and has yet to be vetted by Mikael Lövqvist.
+
+## Overview
+
+A math-like expression language built on top of the reduction scanner, supporting
+operator notation, matrix literals, subscripts, superscripts, and symbolic operators.
+
+## Operator Notation
+
+Operators are identified by their symbol name rather than semantic meaning, since the
+same symbol can mean different things depending on operand types:
+
+- `*` (ASTERISK) — could be scalar multiplication, Hadamard product, or scaling, depending on operand types
+- `·` (DOT) — dot product
+- `×` (CROSS) — cross product
+- `⊕` (OPLUS) — direct sum or XOR
+
+Semantic resolution (e.g. `ASTERISK(matrix, matrix)` → Hadamard) is a separate
+type-inference pass, not part of the structural reduction.
+
+## ASCII Input for Special Symbols
+
+LaTeX-inspired escape sequences for entering special symbols in plain ASCII:
+
+- `\oplus` → ⊕
+- `\times` → ×
+- `\cdot` → ·
+- `\otimes` → ⊗
+
+`^` is reserved for superscript (not XOR), `_` for subscript. `S_12` reads as S₁₂.
+
+## Matrix Literals
+
+Single-line input using nested brackets:
+
+```
+[[1, 0, 0], [0, 1, 0], [0, 0, 1]]
+```
+
+Pretty-printed output using Unicode bracket characters:
+
+```
+⎡1 0 0⎤
+⎢0 1 0⎥
+⎣0 0 1⎦
+```
+
+## 2D Raster Reduction Scanner
+
+For parsing pretty-printed multi-line matrix literals within larger expressions like
+`M + 2 * N` where M and N are written in 2D notation, a raster-based reduction pass
+is needed before the standard 1D reduction pass.
+
+### Approach
+
+1. **Raster pass first** — operate on a 2D grid of characters
+2. Locate matrix corner anchors `⎡⎤⎣⎦` — these are highly selective so candidate
+   detection is cheap
+3. From each `⎡`, scan right for `⎤` and down for `⎣`, then verify `⎦` at their intersection
+4. Use `⎢`/`⎥` to identify row boundaries within the region
+5. Collapse the identified rectangle into a single matrix token
+6. **1D pass second** — the surrounding expression now contains ordinary tokens and
+   the collapsed matrix nodes, reducible by standard rules
+
+### Scope Boundaries
+
+Fraction bars define containment — a matrix appearing in a numerator or denominator
+belongs only to that sub-expression. The horizontal extent of the fraction bar bounds
+the operand scan. Containment must be resolved outside-in: find the outermost structure
+first, then recurse into sub-regions.
+
+### Generalization
+
+A 2D reduction scanner is a natural generalization of the 1D scanner — the "sequence"
+becomes a 2D array and conditions match spatial patterns rather than linear ones.
+The same anchor-point and backtracking concepts apply.
--- a/source/errors.mjs
+++ b/source/errors.mjs
@@ -6,7 +6,7 @@ import { inspect } from 'node:util';

 export class Tokenization_Error extends Error {
 	constructor(data) {
-		const { parser, value, index, end_index } = data;
+		const {parser, text, start_position, end_position, match_start, match_end, value} = data;
 		super(`Tokenization_Error`);	//TODO: Format message
 		this.data = data;
 	}
--- a/source/parsing/regexp-dispatch.mjs
+++ b/source/parsing/regexp-dispatch.mjs
@@ -7,13 +7,24 @@ import { Tokenization_Error } from '@efforting.tech/errors';
 //
 //			Specifically it is not currently decided where the boundary between rule/action/capture should be

+
+function normalize_bounds(text, start_position, end_position) {
+	const len = text.length;
+	const norm_start = start_position < 0 ? Math.max(0, len + start_position) : start_position;
+	const norm_end = end_position == undefined ? undefined : (end_position < 0 ? Math.max(0, len + end_position) : end_position);
+	return [norm_start, norm_end];
+}
+
+
 export class Pattern_Match {
-	constructor(match, rule) {
-		Object.assign(this, { match, rule });
+	constructor(text, start_position, end_position, match, rule) {
+		// Normalize positions
+		[start_position, end_position] = normalize_bounds(text, start_position, end_position);
+		Object.assign(this, { text, start_position, end_position, match, rule });
 	}

 	get identifier() {
-		return this.rule. identifier;
+		return this.rule.identifier;
 	}

 	get value() {
@@ -24,28 +35,34 @@ export class Pattern_Match {
 		return this.match.slice(1);
 	}

-	get pending_index() {
-		return this.match.index + this.match[0].length;
+	get absolute_start() {
+		return this.match.index + this.start_position;
 	}

+	get absolute_end() {
+		return this.match.index + this.start_position + this.match[0].length - 1;
+	}
+
+	get pending_index() {
+		return this.match.index + this.start_position + this.match[0].length;
+	}
 }

 export class Default_Match {
-	constructor(text, index, end_index, action) {
-		const identifier = action(this);
-		Object.assign(this, { text, index, end_index, action, identifier });
-	}
+	//TBD: Here we invoke action while creating this object, and assign the identifier but we don't do that on Pattern_Match - this feels a bit sketchy
+	constructor(text, start_position, end_position, match_start, match_end, value, action) {
+		// Normalize positions
+		[start_position, end_position] = normalize_bounds(text, start_position, end_position);
+		[match_start, match_end] = normalize_bounds(text, match_start, match_end);

-	get value() {
-		return this.text;
+		const identifier = action(this);	//TODO: action protocol in accordance with issue #5
+		Object.assign(this, { text, start_position, end_position, match_start, match_end, value, action, identifier });
 	}

-
 	get pending_index() {
-		if (this.end_index === null) {
-			return null;
-		} else {
-			return this.end_index;
+		// CLARIFICATION: loose inequality ( != undefined ) is false for both null and undefined but true for false/0/'', so a match_end of 0 still counts as set
+		if (this.match_end != undefined) {
+			return this.match_end + 1;
 		}
 	}
 }
@@ -76,8 +93,6 @@ export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
 	}
 }

-// Note: There is no clean built in way to set an end position of a RegExp pattern, the only generic way is to slice the string we match before.
-//		 We may at some point implement support for this (and it would only be done if end position was given)

 export class RegExp_Tokenizer {
 	constructor(rules=[], default_action=undefined) {
@@ -94,53 +109,58 @@ export class RegExp_Tokenizer {
 		this.rules.push(...rules_to_add);
 	}

-	immediate_match(text, position=0) {
+	immediate_match(text, start_position=0, end_position=undefined) {
+		// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number
+		const bounded = start_position !== 0 || end_position != undefined;
+		const text_to_search = bounded ? text.slice(start_position, end_position) : text;
+
 		for (const rule of this.rules) {
 			const pattern = rule.immediate_pattern;
-			pattern.lastIndex = position;
-			const match = pattern.exec(text);
+			pattern.lastIndex = 0;
+			const match = pattern.exec(text_to_search);
 			if (match) {
-				return new Pattern_Match(match, rule);
+				return new Pattern_Match(text, start_position, end_position, match, rule);
 			}
 		}
 	}


-	_handle_default_match(value, index, end_index=null) {
+	_handle_default_match(text, start_position, end_position, match_start, match_end, value) {
 		const { default_action } = this;
 		if (!default_action) {
-			throw new Tokenization_Error({ parser: this, value, index, end_index });
+			throw new Tokenization_Error({ parser: this, text, start_position, end_position, match_start, match_end, value });
 		}
-		return new Default_Match(value, index, end_index, default_action);
+		return new Default_Match(text, start_position, end_position, match_start, match_end, value, default_action);
 	}


-	closest_scanning_match(text, position=0) {
-		const immediate_match = this.immediate_match(text, position);
+	closest_scanning_match(text, start_position=0, end_position=undefined) {
+
+		const immediate_match = this.immediate_match(text, start_position, end_position);
 		if (immediate_match) {
 			return immediate_match;
 		}

 		let best_candidate;
-		for (const candidate of this.iter_scanning_rule_candidates(text, position)) {
-			if ((best_candidate === undefined) || (best_candidate.match.index > candidate.match.index)) {
+		for (const candidate of this.iter_scanning_rule_candidates(text, start_position, end_position)) {
+			if ((best_candidate === undefined) || (best_candidate.absolute_start > candidate.absolute_start)) {
 				best_candidate = candidate;
 			}
 		}

 		// There was no match, just get the tail
 		if (!best_candidate) {
-			const tail = text.slice(position);
+			const tail = text.slice(start_position);
 			if (tail.length) {
-				return this._handle_default_match(tail, position);
+				return this._handle_default_match(text, start_position, end_position, start_position, end_position, tail);
 			}
 		}

 		// There was a match, check the head
 		if (best_candidate) {
-			const head = text.slice(position, best_candidate.match.index);
+			const head = text.slice(start_position, best_candidate.absolute_start);
 			if (head.length) {
-				return this._handle_default_match(head, position, best_candidate.match.index);
+				return this._handle_default_match(text, start_position, end_position, start_position, best_candidate.absolute_start - 1, head);
 			}
 		}

@@ -149,32 +169,41 @@ export class RegExp_Tokenizer {
 	}


-	*iter_scanning_rule_candidates(text, position=0) {
-		// Iterates over all rules and yields any matches found anywhere (but only once per rule)
+	*iter_scanning_rule_candidates(text, start_position=0, end_position=undefined) {
+		// CLARIFICATION: loose inequality ( != ) matches null and undefined but not false/0/'' but we use strict for start_position since it is always a number
+		const bounded = start_position !== 0 || end_position != undefined;
+		const text_to_search = bounded ? text.slice(start_position, end_position) : text;

+
+		// Iterates over all rules and yields any matches found anywhere (but only once per rule)
 		for (const rule of this.rules) {
 			const pattern = rule.scanning_pattern;
-			pattern.lastIndex = position;
-			const match = pattern.exec(text);
+			pattern.lastIndex = 0;
+			const match = pattern.exec(text_to_search);

 			if (match) {
-				yield new Pattern_Match(match, rule);
+				yield new Pattern_Match(text, start_position, end_position, match, rule);
 			}
 		}

 	}


-	*iter_matches(text, position=0) {
+	*iter_matches(text, start_position=0, end_position=undefined) {
+
+		// Normalize positions
+		[start_position, end_position] = normalize_bounds(text, start_position, end_position);
+
 		while (true) {
-			const pending = this.closest_scanning_match(text, position);
+			const pending = this.closest_scanning_match(text, start_position, end_position);
 			if (pending) {
 				yield pending;
 			}
-			if (!pending || pending.pending_index === null) {
+			// CLARIFICATION: loose equality ( == null ) matches both null and undefined but not false/0/''; end_position is compared strictly since it was normalized above
+			if (!pending || pending.pending_index == null || pending.pending_index === end_position ) {
 				break;
 			}
-			position = pending.pending_index;
+			start_position = pending.pending_index;

 		}
 	}
Author	SHA1	Message	Date
Mikael Lövqvist	d584a49579	Worked on #3 where I believe the regexp part is now complete but it needs testing	2026-05-25 02:29:57 +02:00
Mikael Lövqvist	1842d3de9c	Added math sub system planning document written by Claude	2026-05-25 02:28:55 +02:00