// File: nodejs.esm-library/source/parsing/regexp-dispatch.mjs (183 lines, 4.1 KiB, JavaScript)

import * as RE from '@efforting.tech/text/regexp';
import { Tokenization_Error } from '@efforting.tech/errors';
// NOTE: There are some open questions about this implementation and API which may change as the library matures.
// Check out the example at experiments/regexp-tokenizer.mjs for more information on how to use this in its current state.
//
// Specifically it is not currently decided where the boundary between rule/action/capture should be
/**
 * Wraps a successful RegExp exec result together with the rule that produced it.
 */
export class Pattern_Match {
  constructor(match, rule) {
    this.match = match;
    this.rule = rule;
  }

  // Identifier of the rule that produced this match.
  get identifier() {
    return this.rule.identifier;
  }

  // The full matched text (capture group 0).
  get value() {
    return this.match[0];
  }

  // The capture groups (groups 1..n).
  get captured() {
    return this.match.slice(1);
  }

  // Index in the source text just past the end of this match.
  get pending_index() {
    const { index } = this.match;
    return index + this.value.length;
  }
}
/**
 * Fallback match covering a span of text that no rule matched.
 *
 * `action` is invoked with the match instance and its return value becomes
 * the match identifier.
 */
export class Default_Match {
  constructor(text, index, end_index, action) {
    // Bug fix: assign the data fields BEFORE invoking `action`, so the
    // action may inspect the match (e.g. read `.value`) when computing the
    // identifier. Previously the action received an empty instance.
    Object.assign(this, { text, index, end_index, action });
    this.identifier = action(this);
  }

  // The unmatched text span.
  get value() {
    return this.text;
  }

  // Index to resume scanning from, or null when this span is the input tail.
  get pending_index() {
    if (this.end_index === null) {
      return null;
    } else {
      return this.end_index;
    }
  }
}
/**
 * Base class for token rules backed by a RegExp pattern.
 *
 * From the given pattern it derives two compiled variants:
 *  - `immediate_pattern`: sticky ('y') — matches only exactly at lastIndex.
 *  - `scanning_pattern`:  global ('g') — searches forward from lastIndex.
 */
export class Abstract_RegExp_Token_Rule {
  constructor(pattern) {
    const pattern_source = RE.get_source(pattern);
    const pattern_flags = RE.get_flags(pattern);
    // Deduplicate flags via Set while forcing the required flag in.
    // (Previously used `String.prototype.concat(...)` invoked directly on the
    // prototype object, which only works because String.prototype coerces to
    // '' — fragile and non-idiomatic; `join('')` is the standard form.)
    const immediate_flags = [...new Set([...pattern_flags, 'y'])].join('');
    const scanning_flags = [...new Set([...pattern_flags, 'g'])].join('');
    const immediate_pattern = new RegExp(pattern_source, immediate_flags);
    const scanning_pattern = new RegExp(pattern_source, scanning_flags);
    Object.assign(this, { pattern, immediate_pattern, scanning_pattern });
  }
}
/**
 * Concrete token rule. When no identifier is supplied, the rule instance
 * itself serves as the identifier.
 */
export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
  constructor(pattern, identifier = undefined) {
    super(pattern);
    // Bug fix: the previous default parameter `identifier = this` threw a
    // ReferenceError whenever the argument was omitted — default parameters
    // evaluate before the constructor body, and `this` is not accessible in
    // a derived-class constructor until `super()` has run. Resolve the
    // default here instead, after `super(pattern)`.
    if (identifier === undefined) {
      identifier = this;
    }
    Object.assign(this, { identifier });
  }
}
/**
 * Tokenizer dispatching over an ordered list of RegExp token rules.
 *
 * Rules are tried in registration order. Text not covered by any rule is
 * wrapped by `default_action` into a Default_Match; without a default action
 * such text raises a Tokenization_Error.
 */
export class RegExp_Tokenizer {
  constructor(rules = [], default_action = undefined) {
    Object.assign(this, { rules, default_action });
  }

  // Convenience: make unmatched spans yield a fixed identifier instead of throwing.
  set_default_identifier(identifier) {
    this.default_action = (
      () => identifier
    );
  }

  add_rules(...rules_to_add) {
    this.rules.push(...rules_to_add);
  }

  // Return a Pattern_Match for the first rule matching exactly at `position`,
  // or undefined when no rule matches there.
  immediate_match(text, position = 0) {
    for (const rule of this.rules) {
      const pattern = rule.immediate_pattern;
      pattern.lastIndex = position; // sticky ('y') pattern: anchored at position
      const match = pattern.exec(text);
      if (match) {
        return new Pattern_Match(match, rule);
      }
    }
  }

  // Wrap an unmatched span via the default action; throws when none is configured.
  _handle_default_match(value, index, end_index = null) {
    const { default_action } = this;
    if (!default_action) {
      throw new Tokenization_Error({ parser: this, value, index, end_index });
    }
    return new Default_Match(value, index, end_index, default_action);
  }

  // Find the match closest to `position`: an immediate match wins outright;
  // otherwise the earliest scanning match. Unmatched head/tail text becomes
  // a default match. Returns undefined at end of input.
  closest_scanning_match(text, position = 0) {
    const immediate_match = this.immediate_match(text, position);
    if (immediate_match) {
      return immediate_match;
    }
    let best_candidate;
    for (const candidate of this.iter_scanning_rule_candidates(text, position)) {
      if ((best_candidate === undefined) || (best_candidate.match.index > candidate.match.index)) {
        best_candidate = candidate;
      }
    }
    // There was no match, just get the tail
    if (!best_candidate) {
      const tail = text.slice(position);
      if (tail.length) {
        return this._handle_default_match(tail, position);
      }
    }
    // There was a match, check the head
    if (best_candidate) {
      const head = text.slice(position, best_candidate.match.index);
      if (head.length) {
        return this._handle_default_match(head, position, best_candidate.match.index);
      }
    }
    return best_candidate;
  }

  // Iterates over all rules and yields any matches found anywhere (but only once per rule)
  *iter_scanning_rule_candidates(text, position = 0) {
    for (const rule of this.rules) {
      const pattern = rule.scanning_pattern;
      pattern.lastIndex = position; // global ('g') pattern: searches forward from position
      const match = pattern.exec(text);
      if (match) {
        yield new Pattern_Match(match, rule);
      }
    }
  }

  // Yield successive matches over `text`, starting at `position`.
  *iter_matches(text, position = 0) {
    while (true) {
      const pending = this.closest_scanning_match(text, position);
      if (pending) {
        yield pending;
      }
      if (!pending || pending.pending_index === null) {
        break;
      }
      // Bug fix: a zero-width match (e.g. a rule like /a*/ matching the
      // empty string) leaves pending_index === position; without this guard
      // the loop would spin forever re-yielding the same empty match.
      if (pending.pending_index <= position) {
        break;
      }
      position = pending.pending_index;
    }
  }
}