nodejs.esm-library/experiments/generic-parser-1.mjs

import { RegExp_Tokenizer, RegExp_Token_Rule } from '@efforting.tech/parsing/regexp-dispatch';
import { Switchable_Iterator } from '@efforting.tech/data/iteration-utilities';
import { String_Keyed_Stack } from '@efforting.tech/data/stack';
import { assign_defined } from '@efforting.tech/data/object-utilities';
import * as F from '@efforting.tech/schema/field-configuration-factories';
import { inspect } from 'node:util';


// A RegExp_Token_Rule that also carries an `action`, which Parser.parse() reads from `match.rule`.
class RegExp_Token_Parsing_Rule extends RegExp_Token_Rule {
	// `pattern` and `identifier` are forwarded to RegExp_Token_Rule unchanged;
	// `action` is stored as-is on the instance (it may be a function, null or anything else).
	constructor(pattern, action, identifier=undefined) {
		super(pattern, identifier);
		this.action = action;
	}
}


// Schema for the mutable state object carried by a Parser.
// The Parser constructor passes the caller-supplied state through Parser_State.load().
// NOTE(review): presumably F.value supplies a fixed default and F.factory calls the given
// function to build a fresh default per load — confirm against field-configuration-factories.
// NOTE(review): Parser.parse() also writes a `match` property onto the state, which is not declared here.
const Parser_State = new F.Schema({

	// Read by Parser.switch_to() as the start offset handed to tokenizer.iter_matches()
	position: F.value(0, 'Pending position in source'),
	// Array that Parser.push_token() appends to and Parser.parse() returns
	value: F.factory(() => [], 'Pending value to return'),
	// Handlers registered by Parser.enter_sub_tokenizer() and consumed by Parser.leave_sub_tokenizer()
	sub_tokenizer_handlers: F.factory(() => [], 'Pending sub tokenizer handlers'),
	// Object whose iter_matches(source, position) produces the matches the Parser iterates
	tokenizer: F.value(null, 'Current tokenizer'),
	// Not read or written by Parser itself in this file
	context: F.value(null, 'User supplied context'),

}, 'Parser state');


/**
 * Drives a tokenizer over `source` and runs the `action` attached to each matched rule.
 *
 * Actions receive the parser itself and may push tokens, replace the pending value or
 * enter/leave sub tokenizers. Entering a sub tokenizer starts a fresh pending value;
 * leaving it hands that value to the handler registered for that frame or, when there
 * was none, pushes it as a single token onto the enclosing value.
 */
class Parser {
	/**
	 * @param {*} source - Source handed to `tokenizer.iter_matches(source, position)`.
	 * @param {object} [state] - Initial state, passed through `Parser_State.load()`.
	 *   It must provide a `tokenizer`, since iteration is set up immediately.
	 */
	constructor(source, state=undefined) {
		const loaded_state = Parser_State.load(state);
		const token_generator = new Switchable_Iterator();
		const stack = new String_Keyed_Stack(loaded_state);
		Object.assign(this, { source, state: loaded_state, stack, token_generator });
		this.switch_to();
	}


	/**
	 * Restart token iteration, optionally with another tokenizer and/or position.
	 * Arguments left undefined keep the values currently stored in the state.
	 *
	 * @param {object} [tokenizer] - Tokenizer to use from now on.
	 * @param {number} [position] - Position in the source to continue from.
	 */
	switch_to(tokenizer=undefined, position=undefined) {
		assign_defined(this.state, { tokenizer, position });
		this.token_generator.switch_to(this.state.tokenizer.iter_matches(this.source, this.state.position));
	}

	/**
	 * Consume all remaining matches, invoking each rule's action.
	 *
	 * Rules without an action are skipped; actions that are not functions are only logged.
	 *
	 * @param {function(Parser, *): void} [handler] - Called once after the last match with
	 *   the parser and the pending value; it may call `replace_value()`.
	 * @returns {*} The pending value as it is after the handler (if any) has run.
	 */
	parse(handler=undefined) {

		for (const match of this.token_generator) {
			const { action } = match.rule;
			if (!action) { continue; }
			if (typeof action !== 'function') {
				console.log('NOT IMPLEMENTED', action);
				continue;
			}

			// The position is stored before the action runs so that an action that
			// switches tokenizer continues from this match's pending index.
			this.state.position = match.pending_index;
			this.state.match = match;
			action(this, match);

		}

		if (handler) {
			this.state.match = null;	//TODO: Decide if we should reset match here or not
			handler(this, this.state.value);
		}

		// Read after the handler has run so that a replaced value is what gets returned.
		return this.state.value;

	}

	/**
	 * Append tokens to the pending value.
	 *
	 * @param {...*} tokens - Tokens to append, in order.
	 */
	push_token(...tokens) {
		this.state.value.push(...tokens);
	}

	/**
	 * Replace the pending value wholesale.
	 *
	 * @param {*} value - New pending value.
	 */
	replace_value(value) {
		this.state.value = value;
	}

	/**
	 * Start a nested frame with a fresh, empty pending value.
	 *
	 * @param {object} [tokenizer] - Tokenizer for the nested frame; the current one is kept when omitted.
	 * @param {function(Parser, *): void} [handler] - Receives the parser and the nested value
	 *   when this frame is left.
	 */
	enter_sub_tokenizer(tokenizer=undefined, handler=undefined) {
		this.stack.push_defined({ tokenizer, value: [] });
		// Record exactly one entry per frame (null when there is no handler). Pushing only
		// when a handler was given let a handler-less nested frame consume the handler that
		// belongs to an enclosing frame.
		this.state.sub_tokenizer_handlers.push(handler ?? null);
		this.switch_to(tokenizer);
	}

	/**
	 * Leave the innermost frame and deliver its value.
	 *
	 * The frame's own handler is called with the nested value; without a handler the
	 * nested value is pushed as a single token onto the enclosing value.
	 */
	leave_sub_tokenizer() {
		// NOTE(review): assumes pop(true) restores the enclosing state and returns the
		// values of the frame being left — confirm against String_Keyed_Stack.
		const frame = this.stack.pop(true);
		const handler = this.state.sub_tokenizer_handlers.pop();

		if (handler) {
			this.state.match = null;	//TODO: Decide if we should reset match here or not
			handler(this, frame.value);
		} else {
			this.push_token(frame.value);
		}
		this.switch_to();
	}

}


// Demo input: plain words with two levels of nested parentheses.
const text = 'Hello World (how are you (doing)) I may ask';

// Rule actions and handlers; each receives the Parser instance as its first argument.
const push_word = (parser, match) => parser.push_token(match.value);
const wrap_sub_expression = (parser, value) => parser.push_token({kind: 'sub expression', value});
const open_sub_expression = (parser, match) => parser.enter_sub_tokenizer(undefined, wrap_sub_expression);
const close_sub_expression = (parser, match) => parser.leave_sub_tokenizer();
const wrap_result = (parser, value) => parser.replace_value({kind: 'parsing result', value});

const rt = new RegExp_Tokenizer();
rt.add_rules(new RegExp_Token_Parsing_Rule(/\w+/, push_word, 'word'));
// Whitespace has no action, so the parser skips it.
rt.add_rules(new RegExp_Token_Parsing_Rule(/\s+/, null, 'space'));
rt.add_rules(new RegExp_Token_Parsing_Rule(RegExp.escape('('), open_sub_expression, 'lpar'));
rt.add_rules(new RegExp_Token_Parsing_Rule(RegExp.escape(')'), close_sub_expression, 'rpar'));

const p = new Parser(text, { tokenizer: rt });


const parsing_result = p.parse(wrap_result);
console.log(inspect(parsing_result, { colors: true, depth: null }));

/*

Expected output of the console.log call above:

{
  kind: 'parsing result',
  value: [
    'Hello',
    'World',
    {
      kind: 'sub expression',
      value: [
        'how',
        'are',
        'you',
        { kind: 'sub expression', value: [ 'doing' ] }
      ]
    },
    'I',
    'may',
    'ask'
  ]
}


*/


// The script ends here; the raw token dump below is unreachable while this call is in place.
process.exit();


for (const { identifier, value } of rt.iter_matches(text)) {
	console.log({ identifier, value });
}