diff --git a/Makefile b/Makefile
index fbf6111..c163301 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,10 @@ build/packages:
 publish:
 	cd build/packages && ./publish-all.sh
 
+dev: build/packages
+	build/packages/local-install.sh
+
 clean:
 	rm -rf build
 
-.PHONY: clean build/packages publish
\ No newline at end of file
+.PHONY: clean build/packages publish dev
\ No newline at end of file
diff --git a/experiments/regexp-tokenizer.mjs b/experiments/regexp-tokenizer.mjs
new file mode 100644
index 0000000..eb34226
--- /dev/null
+++ b/experiments/regexp-tokenizer.mjs
@@ -0,0 +1,15 @@
+import { RegExp_Tokenizer, RegExp_Token_Rule } from '@efforting.tech/parsing/regexp-dispatch';
+
+
+const rt = new RegExp_Tokenizer();
+
+rt.add_rules(new RegExp_Token_Rule(/\w+/, 'word'));
+rt.set_default_identifier('random stuff');
+
+//console.log(rt.rules);
+
+//console.log(rt.closest_scanning_match('#Hello World!'));
+
+for (const m of rt.iter_matches('#Hello World!')) {
+	console.log({class: m.constructor.name, identifier: m.identifier, value: m.value, captured: m.captured });
+}
\ No newline at end of file
diff --git a/package-manifest.yaml b/package-manifest.yaml
index d20d39c..18c7b85 100644
--- a/package-manifest.yaml
+++ b/package-manifest.yaml
@@ -35,6 +35,14 @@ packages:
     internal-dependencies:
     - data
 
+  parsing:
+    path: source/parsing
+    #documentation: documentation/text
+    description: Generic string parsing
+    internal-dependencies:
+    - errors
+    - text
+
 wip-packages:
   object-graph-storage:
     path: source/object-graph-storage
diff --git a/source/parsing/regexp-dispatch.mjs b/source/parsing/regexp-dispatch.mjs
new file mode 100644
index 0000000..300e802
--- /dev/null
+++ b/source/parsing/regexp-dispatch.mjs
@@ -0,0 +1,181 @@
+import * as RE from '@efforting.tech/text/regexp';
+import { Parsing_Error } from '@efforting.tech/errors';	// NOTE(review): Parsing_Error is thrown below but was never imported — confirm the exact module path within the errors package.
+
+// NOTE: There are some open questions about this implementation and API which may change as the library matures.
+// Check out the example at experiments/regexp-tokenizer.mjs for more information on how to use this in its current state.
+//
+// Specifically it is not currently decided where the boundary between rule/action/capture should be
+
+export class Pattern_Match {	// Wraps a RegExp exec() result together with the rule that produced it.
+	constructor(match, rule) {
+		Object.assign(this, { match, rule });
+	}
+
+	get identifier() {
+		return this.rule.identifier;	// fixed: was `this.rule. identifier` (stray space)
+	}
+
+	get value() {	// The full matched text (group 0).
+		return this.match[0];
+	}
+
+	get captured() {	// The capture groups only (group 0 excluded).
+		return this.match.slice(1);
+	}
+
+	get pending_index() {	// Index just past this match — where scanning should resume.
+		return this.match.index + this.match[0].length;
+	}
+
+}
+
+export class Default_Match {	// Fallback match for a stretch of text no rule matched; identifier is produced by the tokenizer's default_action.
+	constructor(text, index, end_index, action) {
+		const identifier = action(this);	// NOTE(review): action runs before fields are assigned — it must not read this.text/index; confirm intended.
+		Object.assign(this, { text, index, end_index, action, identifier });
+	}
+
+	get value() {
+		return this.text;
+	}
+
+
+	get pending_index() {	// null end_index means "rest of input" — iteration stops after this match.
+		if (this.end_index === null) {
+			return null;
+		} else {
+			return this.end_index;
+		}
+	}
+}
+
+
+export class Abstract_RegExp_Token_Rule {	// Compiles a pattern (RegExp or string) into sticky ('y') and global ('g') variants for anchored vs scanning matches.
+
+	constructor(pattern) {
+
+		const pattern_source = RE.get_source(pattern);
+		const pattern_flags = RE.get_flags(pattern);
+
+		const immediate_flags = String.prototype.concat(...(new Set([...pattern_flags, 'y'])));
+		const scanning_flags = String.prototype.concat(...(new Set([...pattern_flags, 'g'])));
+
+		const immediate_pattern = new RegExp(pattern_source, immediate_flags);
+		const scanning_pattern = new RegExp(pattern_source, scanning_flags);
+
+		Object.assign(this, { pattern, immediate_pattern, scanning_pattern });
+	}
+
+}
+
+export class RegExp_Token_Rule extends Abstract_RegExp_Token_Rule {
+	constructor(pattern, identifier=undefined) {	// fixed: `identifier=this` threw ReferenceError — defaults evaluate before super(), where `this` is not yet initialized.
+		super(pattern);
+		Object.assign(this, { identifier: identifier === undefined ? this : identifier });	// default: the rule itself acts as its identifier
+	}
+}
+
+export class RegExp_Tokenizer {	// First-match-wins tokenizer; unmatched stretches are routed through default_action (or raise Parsing_Error).
+	constructor(rules=[], default_action=undefined) {
+		Object.assign(this, { rules, default_action });
+	}
+
+	set_default_identifier(identifier) {	// Convenience: make every Default_Match carry a fixed identifier.
+		this.default_action = (
+			() => identifier
+		);
+	}
+
+	add_rules(...rules_to_add) {
+		this.rules.push(...rules_to_add);
+	}
+
+	immediate_match(text, position=0) {	// Try each rule anchored exactly at `position`; first rule wins. Returns undefined when none match.
+		for (const rule of this.rules) {
+			const pattern = rule.immediate_pattern;
+			pattern.lastIndex = position;	// sticky patterns are stateful — must reset lastIndex before each exec
+			const match = pattern.exec(text);
+			if (match) {
+				return new Pattern_Match(match, rule);
+			}
+		}
+	}
+
+
+	_handle_default_match(value, index, end_index=null) {	// Wrap unmatched text in a Default_Match, or raise if no default_action is configured.
+		const { default_action } = this;
+		if (!default_action) {
+			throw new Parsing_Error({ parser: this, value, index, end_index });	// NOTE(review): verify Parsing_Error is imported from the errors package
+		}
+		return new Default_Match(value, index, end_index, default_action);
+	}
+
+
+	closest_scanning_match(text, position=0) {	// Next match at or after `position`; unmatched head/tail text becomes a Default_Match.
+		const immediate_match = this.immediate_match(text, position);
+		if (immediate_match) {
+			return immediate_match;
+		}
+
+		let best_candidate;
+		for (const candidate of this.iter_scanning_rule_candidates(text, position)) {
+			if ((best_candidate === undefined) || (best_candidate.match.index > candidate.match.index)) {
+				best_candidate = candidate;	// earliest match wins; rule order breaks ties
+			}
+		}
+
+		// There was no match, just get the tail
+		if (!best_candidate) {
+			const tail = text.slice(position);
+			if (tail.length) {
+				return this._handle_default_match(tail, position);
+			}
+		}
+
+		// There was a match, check the head
+		if (best_candidate) {
+			const head = text.slice(position, best_candidate.match.index);
+			if (head.length) {
+				return this._handle_default_match(head, position, best_candidate.match.index);
+			}
+		}
+
+		return best_candidate;
+
+	}
+
+
+	*iter_scanning_rule_candidates(text, position=0) {
+		// Iterates over all rules and yields any matches found anywhere (but only once per rule)
+
+		for (const rule of this.rules) {
+			const pattern = rule.scanning_pattern;
+			pattern.lastIndex = position;	// global patterns are stateful — must reset lastIndex before each exec
+			const match = pattern.exec(text);
+
+			if (match) {
+				yield new Pattern_Match(match, rule);
+			}
+		}
+
+	}
+
+
+	*iter_matches(text, position=0) {	// Tokenize `text` from `position` to the end, yielding Pattern_Match / Default_Match objects.
+		while (true) {
+			const pending = this.closest_scanning_match(text, position);
+			if (pending) {
+				yield pending;
+			}
+			if (!pending || pending.pending_index === null) {
+				break;
+			}
+			position = pending.pending_index;	// FIXME(review): a zero-width rule match would not advance position and loop forever — consider guarding.
+
+		}
+	}
+
+
+
+}
+
diff --git a/source/text/regexp.mjs b/source/text/regexp.mjs
new file mode 100644
index 0000000..f01ea52
--- /dev/null
+++ b/source/text/regexp.mjs
@@ -0,0 +1,49 @@
+export function get_flags(pattern) {	// Flags of a RegExp as a Set; strings have no flags.
+	if (pattern instanceof RegExp) {
+		return new Set(pattern.flags);
+	} else {
+		return new Set();
+	}
+}
+export function get_source(pattern) {	// Source text of a RegExp; strings pass through unchanged.
+	if (pattern instanceof RegExp) {
+		return pattern.source;
+	} else {
+		return pattern;
+	}
+}
+
+export function concat(...pattern_list) {	// Concatenate RegExps and strings into one RegExp, unioning all flags.
+	let pending_source = '';
+	const pending_flags = new Set();
+
+	for (const pattern of pattern_list) {
+
+		if (pattern instanceof RegExp) {
+			pending_source += pattern.source;
+			for (const flag of pattern.flags) {
+				pending_flags.add(flag);
+			}
+		} else {
+			pending_source += pattern;	// plain strings contribute source only, no flags
+		}
+	}
+
+	return new RegExp(pending_source, String.prototype.concat(...pending_flags));
+}
+
+
+export function join(pattern_list, separator, flags=undefined) {	// Join pattern sources with a separator pattern, like Array.prototype.join.
+	return new RegExp(pattern_list.map(pattern => get_source(pattern)).join(get_source(separator)), flags);
+}
+
+
+export function update_flag(pattern, flag, state) {	// Return a copy of `pattern` with `flag` added (state truthy) or removed.
+	const pattern_flags = get_flags(pattern);
+	if (state) {
+		pattern_flags.add(flag);
+	} else {
+		pattern_flags.delete(flag);
+	}
+	return new RegExp(get_source(pattern), String.prototype.concat(...pattern_flags));	// fixed: was pattern.source, which is undefined for string patterns
+}
\ No newline at end of file