From 148b56df1932e6c5a73486bf8e501bf66af049ad Mon Sep 17 00:00:00 2001 From: loyaltypollution <65063925+loyaltypollution@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:25:42 +0800 Subject: [PATCH 1/4] Refactor error handling and update imports - Moved error handling logic to a dedicated errors module, improving organization and maintainability. --- src/errors.ts | 289 ----------------------------- src/errors/errors.ts | 56 +++--- src/errors/index.ts | 1 + src/generate-ast.ts | 2 +- src/parser/errors.ts | 47 +++++ src/parser/index.ts | 1 + src/{ => parser}/parser.ts | 6 +- src/resolver/errors.ts | 60 ++++++ src/resolver/index.ts | 1 + src/{ => resolver}/resolver.ts | 6 +- src/tokenizer/errors.ts | 112 +++++++++++ src/tokenizer/index.ts | 1 + src/{ => tokenizer}/tokenizer.ts | 33 ++-- src/translator/errors.ts | 26 +++ src/translator/index.ts | 1 + src/{ => translator}/translator.ts | 8 +- 16 files changed, 312 insertions(+), 338 deletions(-) delete mode 100644 src/errors.ts create mode 100644 src/errors/index.ts create mode 100644 src/parser/errors.ts create mode 100644 src/parser/index.ts rename src/{ => parser}/parser.ts (99%) create mode 100644 src/resolver/errors.ts create mode 100644 src/resolver/index.ts rename src/{ => resolver}/resolver.ts (99%) create mode 100644 src/tokenizer/errors.ts create mode 100644 src/tokenizer/index.ts rename src/{ => tokenizer}/tokenizer.ts (99%) create mode 100644 src/translator/errors.ts create mode 100644 src/translator/index.ts rename src/{ => translator}/translator.ts (99%) diff --git a/src/errors.ts b/src/errors.ts deleted file mode 100644 index d0f4b6e..0000000 --- a/src/errors.ts +++ /dev/null @@ -1,289 +0,0 @@ -import { createErrorIndicator } from "./errors/errors"; -import {Token} from "./tokenizer"; -import {Position} from "estree"; - -/* - The offset is calculated as follows: - Current position is one after real position of end of token: 1 -*/ -const MAGIC_OFFSET = 1; - -const SPECIAL_CHARS = new RegExp("[\\\\$'\"]", "g"); - -function escape(unsafe: string): string { - // @TODO escape newlines - return unsafe.replace(SPECIAL_CHARS, "\\$&"); -} - -/* Searches backwards and forwards till it hits a newline */ -function getFullLine(source: string, current: number): {lineIndex: number; msg: string} { - let back: number = current; - let forward: number = current; - if (source[back] == '\n') { - back--; - } - while (back > 0 && source[back] != '\n') { - back--; - } - if (source[back] === '\n') { - back++; - } - while (forward < source.length && source[forward] != '\n') { - forward++; - } - const lineIndex = source.slice(0, back).split('\n').length; - const msg = source.slice(back, forward); - - return {lineIndex, msg}; -} - -function toEstreeLocation(line: number, column: number, offset: number) { - return {line, column, offset} -} - -export namespace TokenizerErrors { - export class BaseTokenizerError extends SyntaxError { - line: number; - col: number; - loc: Position; - - constructor(message: string, line: number, col: number) { - super(`SyntaxError at line ${line} column ${col-1} - ${message}`); - this.line = line; - this.col = col; - this.name = "BaseTokenizerError"; - this.loc = toEstreeLocation(line, col, 0); - } - } - - export class UnknownTokenError extends BaseTokenizerError { - constructor(token: string, line: number, col: number, source: string, current: number) { - let { lineIndex, msg } = getFullLine(source, current-1); - msg = '\n' + msg + '\n'; - let hint = `${col > 1 ? 
'~' : ''}^~ Unknown token '${escape(token)}'`; - // The extra `~` character takes up some space. - hint = hint.padStart(hint.length + col - MAGIC_OFFSET - (col > 1 ? 1 : 0), " "); - super(msg + hint, lineIndex, col); - this.name = "UnknownTokenError"; - } - } - - export class UnterminatedStringError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, start: number, current: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = `^ Unterminated string`; - const diff = (current - start); - // +1 because we want the arrow to point after the string (where we expect the closing ") - hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "~"); - hint = hint.padStart(hint.length + col - diff, " "); - super(msg + hint, lineIndex, col); - this.name = "UnterminatedStringError"; - } - } - - export class NonFourIndentError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, start: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = `^ This indent should be a multiple of 4 spaces. It's currently ${col} spaces.`; - hint = hint.padStart(hint.length + col - MAGIC_OFFSET, "-"); - super(msg + hint, lineIndex, col); - this.name = "NonFourIndentError"; - } - } - - export class InvalidNumberError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, start: number, current: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = `^ Invalid Number input.`; - const diff = (current - start); - // +1 because we want the arrow to point after the string (where we expect the closing ") - hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "~"); - hint = hint.padStart(hint.length + col - diff, " "); - super(msg + hint, lineIndex, col); - this.name = "InvalidNumberError"; - } - } - - export class InconsistentIndentError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, start: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = `^ This indent/dedent is inconsistent with other indents/dedents. It's currently ${col} spaces.`; - hint = hint.padStart(hint.length + col - MAGIC_OFFSET, "-"); - super(msg + hint, lineIndex, col); - this.name = "InconsistentIndentError"; - } - } - export class ForbiddenIdentifierError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, start: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = `^ This identifier is reserved for use in Python. Consider using another identifier.`; - hint = hint.padStart(hint.length + col - MAGIC_OFFSET, "^"); - super(msg + hint, lineIndex, col); - this.name = "ForbiddenIdentifierError"; - } - } - export class ForbiddenOperatorError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, start: number, current: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = ` This operator is reserved for use in Python. 
It's not allowed to be used.`; - const diff = (current - start); - hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "^"); - hint = hint.padStart(hint.length + col - diff, " "); - super(msg + hint, lineIndex, col); - this.name = "ForbiddenOperatorError"; - } - } - - export class NonMatchingParenthesesError extends BaseTokenizerError { - constructor(line: number, col: number, source: string, current: number) { - let { lineIndex, msg } = getFullLine(source, current-1); - msg = '\n' + msg + '\n'; - let hint = `${col > 1 ? '~' : ''}^~ Non-matching closing parentheses.`; - // The extra `~` character takes up some space. - hint = hint.padStart(hint.length + col - MAGIC_OFFSET - (col > 1 ? 1 : 0), " "); - super(msg + hint, lineIndex, col); - this.name = "NonMatchingParenthesesError"; - } - } -} - -export namespace ParserErrors { - export class BaseParserError extends SyntaxError { - line: number; - col: number; - loc: Position; - - constructor(message: string, line: number, col: number) { - super(`SyntaxError at line ${line} - ${message}`); - this.line = line; - this.col = col; - this.name = "BaseParserError"; - this.loc = toEstreeLocation(line, col, 0); - } - } - export class ExpectedTokenError extends BaseParserError { - constructor(source: string, current: Token, expected: string) { - let { lineIndex, msg } = getFullLine(source, current.indexInSource - current.lexeme.length); - msg = '\n' + msg + '\n'; - let hint = `^ ${expected}. Found '${escape(current.lexeme)}'.`; - hint = hint.padStart(hint.length + current.col - MAGIC_OFFSET, " "); - super(msg + hint, lineIndex, current.col); - this.name = "ExpectedTokenError"; - } - } - export class NoElseBlockError extends BaseParserError { - constructor(source: string, current: Token) { - let { lineIndex, msg } = getFullLine(source, current.indexInSource); - msg = '\n' + msg + '\n'; - let hint = `^ Expected else block after this if block.`; - hint = hint.padStart(hint.length + current.col - MAGIC_OFFSET, " "); - super(msg + hint, lineIndex, current.col); - this.name = "ExpectedTokenError"; - } - } - export class GenericUnexpectedSyntaxError extends BaseParserError { - constructor(line: number, col: number, source: string, start: number, current: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = ` Detected invalid syntax.`; - const indicator = createErrorIndicator(msg, '@'); - super(msg + indicator + hint, lineIndex, col); - this.name = "GenericUnexpectedSyntaxError"; - } - } -} - -export namespace ResolverErrors { - export class BaseResolverError extends SyntaxError { - line: number; - col: number; - loc: Position; - - constructor(name: string, message: string, line: number, col: number) { - super(`${name} at line ${line} - ${message}`); - this.line = line; - this.col = col; - this.name = "BaseResolverError"; - this.loc = toEstreeLocation(line, col, 0); - } - } - export class NameNotFoundError extends BaseResolverError { - constructor(line: number, col: number, source: string, start: number, - current: number, suggestion: string | null) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = ` This name is not found in the current or enclosing environment(s).`; - const diff = (current - start); - hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "^"); - hint = hint.padStart(hint.length + col - diff, " "); - if (suggestion !== null) { - let sugg = ` Perhaps you meant to type '${suggestion}'?` - sugg = sugg.padStart(sugg.length + col - 
MAGIC_OFFSET + 1, " "); - sugg = '\n' + sugg; - hint += sugg; - } - const name = "NameNotFoundError"; - super(name, msg + hint, lineIndex, col); - this.name = "NameNotFoundError"; - } - } - - export class NameReassignmentError extends BaseResolverError { - constructor(line: number, col: number, source: string, start: number, - current: number, oldName: Token) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = ` A name has been declared here.`; - const diff = (current - start); - hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "^"); - hint = hint.padStart(hint.length + col - diff, " "); - let { lineIndex: oldLine, msg: oldNameLine } = getFullLine(source, oldName.indexInSource); - oldNameLine = '\n' + oldNameLine + '\n'; - let sugg = ` However, it has already been declared in the same environment at line ${oldLine}, here: ` - sugg = sugg.padStart(sugg.length + col - MAGIC_OFFSET + 1, " "); - sugg = '\n' + sugg; - hint += sugg; - oldNameLine.padStart(oldNameLine.length + col - MAGIC_OFFSET + 1, " "); - hint += oldNameLine; - const name = "NameReassignmentError"; - super(name, msg + hint, lineIndex, col); - this.name = "NameReassignmentError"; - } - } -} - -export namespace TranslatorErrors { - export class BaseTranslatorError extends SyntaxError { - line: number; - col: number; - loc: Position; - - constructor(message: string, line: number, col: number) { - super(`BaseTranslatorError at line ${line} column ${col-1} - ${message}`); - this.line = line; - this.col = col; - this.name = "BaseTranslatorError"; - this.loc = toEstreeLocation(line, col, 0); - } - } - export class UnsupportedOperator extends BaseTranslatorError { - constructor(line: number, col: number, source: string, start: number) { - let { lineIndex, msg } = getFullLine(source, start); - msg = '\n' + msg + '\n'; - let hint = `^ This operator is not yet supported by us.`; - hint = hint.padStart(hint.length + col - MAGIC_OFFSET, " "); - super(msg + hint, lineIndex, col); - this.name = "UnsupportedOperator"; - } - } -} \ No newline at end of file diff --git a/src/errors/errors.ts b/src/errors/errors.ts index 8b3b4f8..f413f87 100644 --- a/src/errors/errors.ts +++ b/src/errors/errors.ts @@ -74,7 +74,7 @@ function typeTranslator(type: string): string { } /* Searches backwards and forwards till it hits a newline */ -function getFullLine(source: string, current: number): { line: number; fullLine: string } { +export function getFullLine(source: string, current: number): { lineIndex: number; fullLine: string } { let back: number = current; let forward: number = current; @@ -88,10 +88,10 @@ function getFullLine(source: string, current: number): { line: number; fullLine: forward++; } - const line = source.slice(0, back).split('\n').length; + const lineIndex = source.slice(0, back).split('\n').length; const fullLine = source.slice(back, forward); - return {line, fullLine}; + return {lineIndex, fullLine}; } export function createErrorIndicator(snippet: string, errorOp: string = '/'): string { @@ -109,7 +109,7 @@ export class TypeConcatenateError extends RuntimeSourceError { this.type = ErrorType.TYPE; let index = (node as any).symbol?.loc?.start?.index; - const { line, fullLine } = getFullLine(source, index); + const { lineIndex, fullLine } = getFullLine(source, index); const snippet = (node as any).symbol?.loc?.source ?? 
''; let hint = 'TypeError: can only concatenate str (not "' + wrongType + '") to str.'; @@ -117,7 +117,7 @@ export class TypeConcatenateError extends RuntimeSourceError { const indicator = createErrorIndicator(snippet, '+'); const name = "TypeError"; const suggestion = "You are trying to concatenate a string with an " + wrongType + ". To fix this, convert the " + wrongType + " to a string using str(), or ensure both operands are of the same type."; - const msg = name + " at line " + line + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + "\n" + suggestion; + const msg = name + " at line " + lineIndex + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + "\n" + suggestion; this.message = msg; } } @@ -129,7 +129,7 @@ export class UnsupportedOperandTypeError extends RuntimeSourceError { let index = (node as any).symbol?.loc?.start?.index ?? (node as any).srcNode?.loc?.start?.index; - const { line, fullLine } = getFullLine(source, index); + const { lineIndex, fullLine } = getFullLine(source, index); const snippet = (node as any).symbol?.loc?.source ?? (node as any).srcNode?.loc?.source ?? ''; @@ -138,7 +138,7 @@ export class UnsupportedOperandTypeError extends RuntimeSourceError { const indicator = createErrorIndicator(snippet, operand); const name = "TypeError"; const suggestion = "You are using the '" + operand+ "' operator between a '" + wrongType1 + "' and a '" + wrongType2 + "', which are not compatible types for this operation.\nMake sure both operands are of the correct type."; - const msg = name + " at line " + line + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + "\n" + suggestion; + const msg = name + " at line " + lineIndex + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + "\n" + suggestion; this.message = msg; } } @@ -159,8 +159,8 @@ export class MissingRequiredPositionalError extends RuntimeSourceError { const index = (node as any).loc?.start?.index ?? (node as any).srcNode?.loc?.start?.index ?? 0; - const { line, fullLine } = getFullLine(source, index); - this.message = 'TypeError at line ' + line + '\n\n ' + fullLine + '\n'; + const { lineIndex, fullLine } = getFullLine(source, index); + this.message = 'TypeError at line ' + lineIndex + '\n\n ' + fullLine + '\n'; if (typeof params === 'number') { this.missingParamCnt = params; @@ -215,8 +215,8 @@ export class TooManyPositionalArgumentsError extends RuntimeSourceError { const index = (node as any).loc?.start?.index ?? (node as any).srcNode?.loc?.start?.index ?? 0; - const { line, fullLine } = getFullLine(source, index); - this.message = 'TypeError at line ' + line + '\n\n ' + fullLine + '\n'; + const { lineIndex, fullLine } = getFullLine(source, index); + this.message = 'TypeError at line ' + lineIndex + '\n\n ' + fullLine + '\n'; if (typeof params === 'number') { this.expectedCount = params; @@ -245,7 +245,7 @@ export class ZeroDivisionError extends RuntimeSourceError { super(node); this.type = ErrorType.TYPE; let index = (node as any).symbol?.loc?.start?.index; - const { line, fullLine } = getFullLine(source, index); + const { lineIndex, fullLine } = getFullLine(source, index); const snippet = (node as any).symbol?.loc?.source ?? ''; let hint = 'ZeroDivisionError: division by zero.'; @@ -253,7 +253,7 @@ export class ZeroDivisionError extends RuntimeSourceError { const indicator = createErrorIndicator(snippet, '/'); const name = "ZeroDivisionError"; const suggestion = "You attempted to divide by zero. 
Division or modulo operations cannot be performed with a divisor of zero. Please ensure that the divisor is non-zero before performing the operation."; - const msg = name + " at line " + line + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + "\n" + suggestion; + const msg = name + " at line " + lineIndex + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + "\n" + suggestion; this.message = msg; } } @@ -267,7 +267,7 @@ export class StepLimitExceededError extends RuntimeSourceError { ?? (node as any).srcNode?.loc?.start?.index ?? 0; - const { line, fullLine } = getFullLine(source, index); + const { lineIndex, fullLine } = getFullLine(source, index); const snippet = (node as any).loc?.source ?? (node as any).srcNode?.loc?.source @@ -282,7 +282,7 @@ export class StepLimitExceededError extends RuntimeSourceError { const adjustedOffset = offset >= 0 ? offset : 0; const msg = [ - `${name} at line ${line}`, + `${name} at line ${lineIndex}`, '', ' ' + fullLine, ' ' + ' '.repeat(adjustedOffset) + indicator, @@ -300,7 +300,7 @@ export class ValueError extends RuntimeSourceError { const index = (node as any).loc?.start?.index ?? (node as any).srcNode?.loc?.start?.index ?? 0; - const { line, fullLine } = getFullLine(source, index); + const { lineIndex, fullLine } = getFullLine(source, index); const snippet = (node as any).loc?.source ?? (node as any).srcNode?.loc?.source ?? ''; @@ -309,7 +309,7 @@ export class ValueError extends RuntimeSourceError { const indicator = createErrorIndicator(snippet, '@'); const name = "ValueError"; const suggestion = `Ensure that the input value(s) passed to '${functionName}' satisfy the mathematical requirements`; - const msg = name + " at line " + line + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + suggestion; + const msg = name + " at line " + lineIndex + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + suggestion; this.message = msg; } } @@ -322,7 +322,7 @@ export class TypeError extends RuntimeSourceError { const index = (node as any).loc?.start?.index ?? (node as any).srcNode?.loc?.start?.index ?? 0; - const { line, fullLine } = getFullLine(source, index); + const { lineIndex, fullLine } = getFullLine(source, index); const snippet = (node as any).loc?.source ?? (node as any).srcNode?.loc?.source ?? ''; @@ -331,7 +331,7 @@ export class TypeError extends RuntimeSourceError { const indicator = createErrorIndicator(snippet, '@'); const name = "TypeError"; const suggestion = ' Make sure the value you are passing is compatible with the expected type.'; - const msg = name + " at line " + line + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + suggestion; + const msg = name + " at line " + lineIndex + "\n\n " + fullLine + "\n " + " ".repeat(offset) + indicator + "\n" + hint + suggestion; this.message = msg; } } @@ -352,7 +352,7 @@ export class SublanguageError extends RuntimeSourceError { const index = (node as any).loc?.start?.index ?? (node as any).srcNode?.loc?.start?.index ?? 0 - const { line, fullLine } = getFullLine(source, index) + const { lineIndex, fullLine } = getFullLine(source, index) const snippet = (node as any).loc?.source ?? (node as any).srcNode?.loc?.source ?? '' @@ -363,6 +363,20 @@ export class SublanguageError extends RuntimeSourceError { const hint = 'Feature not supported in Python §' + chapter + '. 
'
     const suggestion = `The call to '${functionName}()' relies on behaviour that is valid in full Python but outside the Python §1 sublanguage${details ? ': ' + details : ''}.`
 
-    this.message = `${name} at line ${line}\n\n  ${fullLine}\n  ${' '.repeat(offset)}${indicator}\n${hint}${suggestion}`
+    this.message = `${name} at line ${lineIndex}\n\n  ${fullLine}\n  ${' '.repeat(offset)}${indicator}\n${hint}${suggestion}`
   }
 }
+
+/*
+    The offset is calculated as follows:
+    Current position is one after real position of end of token: 1
+*/
+export const MAGIC_OFFSET = 1;
+
+export const SPECIAL_CHARS = new RegExp("[\\\\$'\"]", "g");
+
+export function escape(unsafe: string): string {
+    // @TODO escape newlines
+    return unsafe.replace(SPECIAL_CHARS, "\\$&");
+}
+
diff --git a/src/errors/index.ts b/src/errors/index.ts
new file mode 100644
index 0000000..64a2a47
--- /dev/null
+++ b/src/errors/index.ts
@@ -0,0 +1 @@
+export * from "./errors";
\ No newline at end of file
diff --git a/src/generate-ast.ts b/src/generate-ast.ts
index 337cb58..02dadec 100644
--- a/src/generate-ast.ts
+++ b/src/generate-ast.ts
@@ -93,7 +93,7 @@ export class AstWriter {
         fs.writeFileSync(FILE_NAME, "");
         this.writeSingleLine('// This file is autogenerated by generate-ast.ts. DO NOT EDIT THIS FILE DIRECTLY.');
         // Imports
-        this.writeSingleLine('import {Token} from "./tokenizer";')
+        this.writeSingleLine('import {Token} from "./tokens";')
         this.writeSingleLine('');
     }
 
diff --git a/src/parser/errors.ts b/src/parser/errors.ts
new file mode 100644
index 0000000..9a2474d
--- /dev/null
+++ b/src/parser/errors.ts
@@ -0,0 +1,47 @@
+import { createErrorIndicator, escape, getFullLine, MAGIC_OFFSET } from "../errors";
+import { Token } from "../tokenizer";
+
+export namespace ParserErrors {
+    export class BaseParserError extends SyntaxError {
+        line: number;
+        col: number;
+
+        constructor(message: string, line: number, col: number) {
+            super(`SyntaxError at line ${line}
+            ${message}`);
+            this.line = line;
+            this.col = col;
+            this.name = "BaseParserError";
+        }
+    }
+    export class ExpectedTokenError extends BaseParserError {
+        constructor(source: string, current: Token, expected: string) {
+            let { lineIndex, fullLine } = getFullLine(source, current.indexInSource - current.lexeme.length);
+            fullLine = '\n' + fullLine + '\n';
+            let hint = `^ ${expected}. Found '${escape(current.lexeme)}'.`;
+            hint = hint.padStart(hint.length + current.col - MAGIC_OFFSET, " ");
+            super(fullLine + hint, lineIndex, current.col);
+            this.name = "ExpectedTokenError";
+        }
+    }
+    export class NoElseBlockError extends BaseParserError {
+        constructor(source: string, current: Token) {
+            let { lineIndex, fullLine } = getFullLine(source, current.indexInSource);
+            fullLine = '\n' + fullLine + '\n';
+            let hint = `^ Expected else block after this if block.`;
+            hint = hint.padStart(hint.length + current.col - MAGIC_OFFSET, " ");
+            super(fullLine + hint, lineIndex, current.col);
+            this.name = "NoElseBlockError";
+        }
+    }
+    export class GenericUnexpectedSyntaxError extends BaseParserError {
+        constructor(line: number, col: number, source: string, start: number, current: number) {
+            let { lineIndex, fullLine } = getFullLine(source, start);
+            fullLine = '\n' + fullLine + '\n';
+            let hint = ` Detected invalid syntax.`;
+            const indicator = createErrorIndicator(fullLine, '@');
+            super(fullLine + indicator + hint, lineIndex, col);
+            this.name = "GenericUnexpectedSyntaxError";
+        }
+    }
+}
diff --git a/src/parser/index.ts b/src/parser/index.ts
new file mode 100644
index 0000000..c278cf4
--- /dev/null
+++ b/src/parser/index.ts
@@ -0,0 +1 @@
+export { Parser } from "./parser";
\ No newline at end of file
diff --git a/src/parser.ts b/src/parser/parser.ts
similarity index 99%
rename from src/parser.ts
rename to src/parser/parser.ts
index 5cd6058..1705f81 100644
--- a/src/parser.ts
+++ b/src/parser/parser.ts
@@ -39,9 +39,9 @@ IN THE SOFTWARE.
 
 **/
 
-import { SPECIAL_IDENTIFIER_TOKENS, Token } from "./tokenizer";
-import { TokenType } from "./tokens";
-import { ExprNS, StmtNS } from "./ast-types";
+import { Token, SPECIAL_IDENTIFIER_TOKENS } from "../tokenizer/tokenizer";
+import { TokenType } from "../tokens";
+import { ExprNS, StmtNS } from "../ast-types";
 import { ParserErrors } from "./errors";
 
 type Expr = ExprNS.Expr;
diff --git a/src/resolver/errors.ts b/src/resolver/errors.ts
new file mode 100644
index 0000000..434dea2
--- /dev/null
+++ b/src/resolver/errors.ts
@@ -0,0 +1,60 @@
+import { getFullLine, MAGIC_OFFSET } from "../errors";
+import { Token } from "../tokenizer";
+
+export namespace ResolverErrors {
+    export class BaseResolverError extends SyntaxError {
+        line: number;
+        col: number;
+
+        constructor(name: string, message: string, line: number, col: number) {
+            super(`${name} at line ${line}
+            ${message}`);
+            this.line = line;
+            this.col = col;
+            this.name = "BaseResolverError";
+        }
+    }
+    export class NameNotFoundError extends BaseResolverError {
+        constructor(line: number, col: number, source: string, start: number,
+                    current: number, suggestion: string | null) {
+            let { lineIndex, fullLine } = getFullLine(source, start);
+            fullLine = '\n' + fullLine + '\n';
+            let hint = ` This name is not found in the current or enclosing environment(s).`;
+            const diff = (current - start);
+            hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "^");
+            hint = hint.padStart(hint.length + col - diff, " ");
+            if (suggestion !== null) {
+                let sugg = ` Perhaps you meant to type '${suggestion}'?`
+                sugg = sugg.padStart(sugg.length + col - MAGIC_OFFSET + 1, " ");
+                sugg = '\n' + sugg;
+                hint += sugg;
+            }
+            const name = "NameNotFoundError";
+            super(name, fullLine + hint, lineIndex, col);
+            this.name = "NameNotFoundError";
+        }
+    }
+
+    export class NameReassignmentError extends BaseResolverError {
+        constructor(line: number, col: number, source: string, start: number,
+                    current: number, oldName: Token) {
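+            // Sketch of the message assembled below (illustrative only, assuming a
+            // 1-based `col` from the tokenizer and that `start`..`current` spans
+            // the redeclared name). For the program
+            //     x = 1
+            //     x = 2
+            // the rendered error looks roughly like:
+            //     x = 2
+            //     ^ A name has been declared here.
+            //      However, it has already been declared in the same environment at line 1, here:
+            //     x = 1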
+            let { lineIndex, fullLine } = getFullLine(source, start);
+            fullLine = '\n' + fullLine + '\n';
+            let hint = ` A name has been declared here.`;
+            const diff = (current - start);
+            hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "^");
+            hint = hint.padStart(hint.length + col - diff, " ");
+            let { lineIndex: oldLine, fullLine: oldNameLine } = getFullLine(source, oldName.indexInSource);
+            oldNameLine = '\n' + oldNameLine + '\n';
+            let sugg = ` However, it has already been declared in the same environment at line ${oldLine}, here: `
+            sugg = sugg.padStart(sugg.length + col - MAGIC_OFFSET + 1, " ");
+            sugg = '\n' + sugg;
+            hint += sugg;
+            // The old declaration line is shown verbatim below.
+            hint += oldNameLine;
+            const name = "NameReassignmentError";
+            super(name, fullLine + hint, lineIndex, col);
+            this.name = "NameReassignmentError";
+        }
+    }
+}
diff --git a/src/resolver/index.ts b/src/resolver/index.ts
new file mode 100644
index 0000000..ef3fa35
--- /dev/null
+++ b/src/resolver/index.ts
@@ -0,0 +1 @@
+export { Resolver } from "./resolver";
\ No newline at end of file
diff --git a/src/resolver.ts b/src/resolver/resolver.ts
similarity index 99%
rename from src/resolver.ts
rename to src/resolver/resolver.ts
index 50d4404..9adda4d 100644
--- a/src/resolver.ts
+++ b/src/resolver/resolver.ts
@@ -1,8 +1,8 @@
-import { StmtNS, ExprNS } from "./ast-types";
+import { StmtNS, ExprNS } from "../ast-types";
 type Expr = ExprNS.Expr;
 type Stmt = StmtNS.Stmt;
-import { Token } from "./tokenizer";
-import { TokenType } from "./tokens";
+import { Token } from "../tokenizer/tokenizer";
+import { TokenType } from "../tokens";
 import { ResolverErrors } from "./errors";
 import levenshtein from 'fast-levenshtein';
 
diff --git a/src/tokenizer/errors.ts b/src/tokenizer/errors.ts
new file mode 100644
index 0000000..a3b20e4
--- /dev/null
+++ b/src/tokenizer/errors.ts
@@ -0,0 +1,112 @@
+import { escape, getFullLine, MAGIC_OFFSET } from "../errors";
+
+export namespace TokenizerErrors {
+    export class BaseTokenizerError extends SyntaxError {
+        line: number;
+        col: number;
+
+        constructor(message: string, line: number, col: number) {
+            super(`SyntaxError at line ${line} column ${col-1}
+            ${message}`);
+            this.line = line;
+            this.col = col;
+            this.name = "BaseTokenizerError";
+        }
+    }
+
+    export class UnknownTokenError extends BaseTokenizerError {
+        constructor(token: string, line: number, col: number, source: string, current: number) {
+            let { lineIndex, fullLine } = getFullLine(source, current-1);
+            fullLine = '\n' + fullLine + '\n';
+            let hint = `${col > 1 ? '~' : ''}^~ Unknown token '${escape(token)}'`;
+            // The extra `~` character takes up some space.
+            hint = hint.padStart(hint.length + col - MAGIC_OFFSET - (col > 1 ? 1 : 0), " ");
1 : 0), " "); + super(fullLine + hint, lineIndex, col); + this.name = "UnknownTokenError"; + } + } + + export class UnterminatedStringError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, start: number, current: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = `^ Unterminated string`; + const diff = (current - start); + // +1 because we want the arrow to point after the string (where we expect the closing ") + hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "~"); + hint = hint.padStart(hint.length + col - diff, " "); + super(fullLine + hint, lineIndex, col); + this.name = "UnterminatedStringError"; + } + } + + export class NonFourIndentError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, start: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = `^ This indent should be a multiple of 4 spaces. It's currently ${col} spaces.`; + hint = hint.padStart(hint.length + col - MAGIC_OFFSET, "-"); + super(fullLine + hint, lineIndex, col); + this.name = "NonFourIndentError"; + } + } + + export class InvalidNumberError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, start: number, current: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = `^ Invalid Number input.`; + const diff = (current - start); + // +1 because we want the arrow to point after the string (where we expect the closing ") + hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "~"); + hint = hint.padStart(hint.length + col - diff, " "); + super(fullLine + hint, lineIndex, col); + this.name = "InvalidNumberError"; + } + } + + export class InconsistentIndentError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, start: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = `^ This indent/dedent is inconsistent with other indents/dedents. It's currently ${col} spaces.`; + hint = hint.padStart(hint.length + col - MAGIC_OFFSET, "-"); + super(fullLine + hint, lineIndex, col); + this.name = "InconsistentIndentError"; + } + } + export class ForbiddenIdentifierError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, start: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = `^ This identifier is reserved for use in Python. Consider using another identifier.`; + hint = hint.padStart(hint.length + col - MAGIC_OFFSET, "^"); + super(fullLine + hint, lineIndex, col); + this.name = "ForbiddenIdentifierError"; + } + } + export class ForbiddenOperatorError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, start: number, current: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = ` This operator is reserved for use in Python. 
It's not allowed to be used.`; + const diff = (current - start); + hint = hint.padStart(hint.length + diff - MAGIC_OFFSET + 1, "^"); + hint = hint.padStart(hint.length + col - diff, " "); + super(fullLine + hint, lineIndex, col); + this.name = "ForbiddenOperatorError"; + } + } + + export class NonMatchingParenthesesError extends BaseTokenizerError { + constructor(line: number, col: number, source: string, current: number) { + let { lineIndex, fullLine } = getFullLine(source, current-1); + fullLine = '\n' + fullLine + '\n'; + let hint = `${col > 1 ? '~' : ''}^~ Non-matching closing parentheses.`; + // The extra `~` character takes up some space. + hint = hint.padStart(hint.length + col - MAGIC_OFFSET - (col > 1 ? 1 : 0), " "); + super(fullLine + hint, lineIndex, col); + this.name = "NonMatchingParenthesesError"; + } + } +} diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts new file mode 100644 index 0000000..d18f373 --- /dev/null +++ b/src/tokenizer/index.ts @@ -0,0 +1 @@ +export { Token, Tokenizer } from "./tokenizer"; \ No newline at end of file diff --git a/src/tokenizer.ts b/src/tokenizer/tokenizer.ts similarity index 99% rename from src/tokenizer.ts rename to src/tokenizer/tokenizer.ts index 6600dba..164ada1 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer/tokenizer.ts @@ -40,25 +40,9 @@ IN THE SOFTWARE. * */ -import { TokenType } from "./tokens"; +import { TokenType } from "../tokens"; import { TokenizerErrors } from "./errors"; -export class Token { - type: TokenType; - lexeme: string; - line: number; - col: number; - indexInSource: number; - - constructor(type: TokenType, lexeme: string, line: number, col: number, indexInSource: number) { - this.type = type; - this.lexeme = lexeme; - this.line = line; - this.col = col; - this.indexInSource = indexInSource - } -} - const specialIdentifiers = new Map([ ["and", TokenType.AND], ["or", TokenType.OR], @@ -88,6 +72,21 @@ const specialIdentifiers = new Map([ export const SPECIAL_IDENTIFIER_TOKENS = Array.from(specialIdentifiers.values()); +export class Token { + type: TokenType; + lexeme: string; + line: number; + col: number; + indexInSource: number; + + constructor(type: TokenType, lexeme: string, line: number, col: number, indexInSource: number) { + this.type = type; + this.lexeme = lexeme; + this.line = line; + this.col = col; + this.indexInSource = indexInSource + } +} export class Tokenizer { private readonly source: string; diff --git a/src/translator/errors.ts b/src/translator/errors.ts new file mode 100644 index 0000000..6b93993 --- /dev/null +++ b/src/translator/errors.ts @@ -0,0 +1,26 @@ +import { getFullLine, MAGIC_OFFSET } from "../errors"; + +export namespace TranslatorErrors { + export class BaseTranslatorError extends SyntaxError { + line: number; + col: number; + + constructor(message: string, line: number, col: number) { + super(`BaseTranslatorError at line ${line} column ${col-1} + ${message}`); + this.line = line; + this.col = col; + this.name = "BaseTranslatorError"; + } + } + export class UnsupportedOperator extends BaseTranslatorError { + constructor(line: number, col: number, source: string, start: number) { + let { lineIndex, fullLine } = getFullLine(source, start); + fullLine = '\n' + fullLine + '\n'; + let hint = `^ This operator is not yet supported by us.`; + hint = hint.padStart(hint.length + col - MAGIC_OFFSET, " "); + super(fullLine + hint, lineIndex, col); + this.name = "UnsupportedOperator"; + } + } +} \ No newline at end of file diff --git a/src/translator/index.ts b/src/translator/index.ts new 
file mode 100644 index 0000000..98c47bf --- /dev/null +++ b/src/translator/index.ts @@ -0,0 +1 @@ +export { Translator } from "./translator"; \ No newline at end of file diff --git a/src/translator.ts b/src/translator/translator.ts similarity index 99% rename from src/translator.ts rename to src/translator/translator.ts index f7a7d24..8700a57 100644 --- a/src/translator.ts +++ b/src/translator/translator.ts @@ -2,12 +2,12 @@ * Translate our AST to estree AST (Source's AST) * */ -import { StmtNS, ExprNS } from "./ast-types"; +import { StmtNS, ExprNS } from "../ast-types"; type Expr = ExprNS.Expr; type Stmt = StmtNS.Stmt; -import { Token } from "./tokenizer"; -import { TokenType } from "./tokens"; +import { Token } from "../tokenizer"; +import { TokenType } from "../tokens"; import { ArrowFunctionExpression, @@ -41,7 +41,7 @@ import { WhileStatement } from "estree"; import { TranslatorErrors } from "./errors"; -import { ComplexLiteral, None } from "./types"; +import { ComplexLiteral, None } from "../types"; // import { isEmpty } from "lodash"; export interface EstreePosition { From 0b277b4a6c67e90ebd1c16840f2da302d8eb3533 Mon Sep 17 00:00:00 2001 From: loyaltypollution <65063925+loyaltypollution@users.noreply.github.com> Date: Wed, 29 Oct 2025 01:46:38 +0800 Subject: [PATCH 2/4] Implemented Nearley parsing --- package-lock.json | 151 +++++++++ package.json | 7 + src/nearley/lexer.ts | 262 +++++++++++++++ src/nearley/parser-adapter.ts | 109 ++++++ src/nearley/python-grammar.ts | 514 ++++++++++++++++++++++++++++ src/nearley/python.ne | 612 ++++++++++++++++++++++++++++++++++ src/runner/pyRunner.ts | 10 +- 7 files changed, 1661 insertions(+), 4 deletions(-) create mode 100644 src/nearley/lexer.ts create mode 100644 src/nearley/parser-adapter.ts create mode 100644 src/nearley/python-grammar.ts create mode 100644 src/nearley/python.ne diff --git a/package-lock.json b/package-lock.json index e92bcb6..bf7f323 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,11 +23,16 @@ "@types/fast-levenshtein": "^0.0.4", "@types/jest": "^29.4.0", "@types/mathjs": "^9.4.1", + "@types/moo": "^0.5.10", + "@types/nearley": "^2.11.5", "@types/node": "^18.19.84", "glob": "^11.0.1", "jest": "^29.7.0", "jsdoc": "^4.0.4", + "moo": "^0.5.2", + "nearley": "^2.20.1", "nodemon": "^3.1.10", + "peggy": "^5.0.6", "rimraf": "^3.0.2", "rollup": "^4.38.0", "rollup-plugin-modify": "^3.0.0", @@ -1085,6 +1090,32 @@ "node": ">=v12.0.0" } }, + "node_modules/@peggyjs/from-mem": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@peggyjs/from-mem/-/from-mem-3.1.1.tgz", + "integrity": "sha512-m5OEjgJaePWpyNtQCvRZkpLoV+z44eh6QIO9yEwQuOThdUdkECO3wcKLT3tFA3H8WM5bxU/K/dpmo7r/X16UEw==", + "dev": true, + "license": "MIT", + "dependencies": { + "semver": "7.7.2" + }, + "engines": { + "node": ">=20.8" + } + }, + "node_modules/@peggyjs/from-mem/node_modules/semver": { + "version": "7.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", + "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/@rollup/plugin-commonjs": { "version": "28.0.6", "resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-28.0.6.tgz", @@ -1713,6 +1744,20 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/moo": { + "version": "0.5.10", + "resolved": "https://registry.npmjs.org/@types/moo/-/moo-0.5.10.tgz", 
+ "integrity": "sha512-W6KzyZjXUYpwQfLK1O1UDzqcqYlul+lO7Bt71luyIIyNlOZwJaNeWWdqFs1C/f2hohZvUFHMk6oFNe9Rg48DbA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/nearley": { + "version": "2.11.5", + "resolved": "https://registry.npmjs.org/@types/nearley/-/nearley-2.11.5.tgz", + "integrity": "sha512-dM7TrN0bVxGGXTYGx4YhGear8ysLO5SOuouAWM9oltjQ3m9oYa13qi8Z1DJp5zxVMPukvQdsrnZmgzpeuTSEQA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/node": { "version": "18.19.127", "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.127.tgz", @@ -2508,6 +2553,13 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/discontinuous-range": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/discontinuous-range/-/discontinuous-range-1.0.0.tgz", + "integrity": "sha512-c68LpLbO+7kP/b1Hr1qs8/BJ09F5khZGTxqxZuhzxpmwJKOgRFHJWIb9/KmqnqHhLdO55aOxFH/EGBvUQbL/RQ==", + "dev": true, + "license": "MIT" + }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -4333,6 +4385,13 @@ "node": ">=10" } }, + "node_modules/moo": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/moo/-/moo-0.5.2.tgz", + "integrity": "sha512-iSAJLHYKnX41mKcJKjqvnAN9sf0LMDTXDEvFv+ffuRR9a1MIuXLjMNL6EsnDHSkKLTWNqQQ5uo61P4EbU4NU+Q==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ -4347,6 +4406,29 @@ "dev": true, "license": "MIT" }, + "node_modules/nearley": { + "version": "2.20.1", + "resolved": "https://registry.npmjs.org/nearley/-/nearley-2.20.1.tgz", + "integrity": "sha512-+Mc8UaAebFzgV+KpI5n7DasuuQCHA89dmwm7JXw3TV43ukfNQ9DnBH3Mdb2g/I4Fdxc26pwimBWvjIw0UAILSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "commander": "^2.19.0", + "moo": "^0.5.0", + "railroad-diagrams": "^1.0.0", + "randexp": "0.4.6" + }, + "bin": { + "nearley-railroad": "bin/nearley-railroad.js", + "nearley-test": "bin/nearley-test.js", + "nearley-unparse": "bin/nearley-unparse.js", + "nearleyc": "bin/nearleyc.js" + }, + "funding": { + "type": "individual", + "url": "https://nearley.js.org/#give-to-nearley" + } + }, "node_modules/neo-async": { "version": "2.6.2", "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", @@ -4688,6 +4770,34 @@ "node": "20 || >=22" } }, + "node_modules/peggy": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/peggy/-/peggy-5.0.6.tgz", + "integrity": "sha512-Sud8Zus0JAgE+U4zwkJv29OOaXhviFI7J90/6cGfy3OoqR8dpnieeF9a46dj0bTtqiFnrFatldA6ltQyOJvNmg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peggyjs/from-mem": "3.1.1", + "commander": "^14.0.0", + "source-map-generator": "2.0.2" + }, + "bin": { + "peggy": "bin/peggy.js" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/peggy/node_modules/commander": { + "version": "14.0.2", + "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz", + "integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=20" + } + }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", @@ -4807,6 +4917,27 @@ ], "license": "MIT" }, + "node_modules/railroad-diagrams": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/railroad-diagrams/-/railroad-diagrams-1.0.0.tgz", + 
"integrity": "sha512-cz93DjNeLY0idrCNOH6PviZGRN9GJhsdm9hpn1YCS879fj4W+x5IFJhhkRZcwVgMmFF7R82UA/7Oh+R8lLZg6A==", + "dev": true, + "license": "CC0-1.0" + }, + "node_modules/randexp": { + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/randexp/-/randexp-0.4.6.tgz", + "integrity": "sha512-80WNmd9DA0tmZrw9qQa62GPPWfuXJknrmVmLcxvq4uZBdYqb1wYoKTmnlGUchvVWe0XiLupYkBoXVOxz3C8DYQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "discontinuous-range": "1.0.0", + "ret": "~0.1.10" + }, + "engines": { + "node": ">=0.12" + } + }, "node_modules/randombytes": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", @@ -4924,6 +5055,16 @@ "node": ">=10" } }, + "node_modules/ret": { + "version": "0.1.15", + "resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz", + "integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.12" + } + }, "node_modules/rimraf": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", @@ -5180,6 +5321,16 @@ "node": ">=0.10.0" } }, + "node_modules/source-map-generator": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/source-map-generator/-/source-map-generator-2.0.2.tgz", + "integrity": "sha512-unCl5BQhF/us51DiT7SvlSY3QUPhyfAdHJxd8l7FXdwzqxli0UDMV2dEuei2SeGp3Z4rB/AJ9zKi1mGOp2K2ww==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=20" + } + }, "node_modules/source-map-support": { "version": "0.5.13", "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.13.tgz", diff --git a/package.json b/package.json index 29e08a8..0686ada 100644 --- a/package.json +++ b/package.json @@ -4,8 +4,10 @@ "description": "", "main": "dist/index.js", "types": "dist/index.d.ts", + "type": "module", "scripts": { "regen": "npm run build && node dist/generate.js", + "compile-grammar": "nearleyc src/nearley/python.ne -o src/nearley/python-grammar.ts", "start:dev": "npx nodemon", "build": "rollup -c --bundleConfigAsCjs", "start": "npm run build && node dist/index.js", @@ -31,11 +33,16 @@ "@types/fast-levenshtein": "^0.0.4", "@types/jest": "^29.4.0", "@types/mathjs": "^9.4.1", + "@types/moo": "^0.5.10", + "@types/nearley": "^2.11.5", "@types/node": "^18.19.84", "glob": "^11.0.1", "jest": "^29.7.0", "jsdoc": "^4.0.4", + "moo": "^0.5.2", + "nearley": "^2.20.1", "nodemon": "^3.1.10", + "peggy": "^5.0.6", "rimraf": "^3.0.2", "rollup": "^4.38.0", "rollup-plugin-modify": "^3.0.0", diff --git a/src/nearley/lexer.ts b/src/nearley/lexer.ts new file mode 100644 index 0000000..51c0f99 --- /dev/null +++ b/src/nearley/lexer.ts @@ -0,0 +1,262 @@ +/** + * Moo lexer configuration for Python subset + * This replaces the hand-written tokenizer + */ + +import moo from 'moo'; + +// Track indentation state +let indentStack: number[] = [0]; + +export const lexer = moo.compile({ + // Whitespace and line handling + newline: { match: /\n/, lineBreaks: true }, + ws: { match: /[ \t]+/ }, + + // Comments + comment: /#[^\n]*/, + + // Numbers + complex: /(?:\d+\.?\d*|\.\d+)[jJ]/, + bigint: /\d+[nN]/, + float: /(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?/, + integer: /\d+/, + + // Strings (simplified - doesn't handle all edge cases yet) + stringTripleDouble: /"""(?:[^"\\]|\\["\\/bfnrt]|\\u[a-fA-F0-9]{4})*?"""/, + stringTripleSingle: /'''(?:[^'\\]|\\['\\/bfnrt]|\\u[a-fA-F0-9]{4})*?'''/, + stringDouble: /"(?:[^"\\]|\\["\\/bfnrt]|\\u[a-fA-F0-9]{4})*?"/, + 
stringSingle: /'(?:[^'\\]|\\['\\/bfnrt]|\\u[a-fA-F0-9]{4})*?'/, + + // Keywords (must come before identifier) + kw_def: 'def', + kw_if: 'if', + kw_elif: 'elif', + kw_else: 'else', + kw_while: 'while', + kw_for: 'for', + kw_in: 'in', + kw_return: 'return', + kw_pass: 'pass', + kw_break: 'break', + kw_continue: 'continue', + kw_and: 'and', + kw_or: 'or', + kw_not: 'not', + kw_is: 'is', + kw_lambda: 'lambda', + kw_from: 'from', + kw_import: 'import', + kw_global: 'global', + kw_nonlocal: 'nonlocal', + kw_assert: 'assert', + kw_True: 'True', + kw_False: 'False', + kw_None: 'None', + + // Forbidden keywords (for error reporting) + forbidden_async: 'async', + forbidden_await: 'await', + forbidden_yield: 'yield', + forbidden_with: 'with', + forbidden_del: 'del', + forbidden_try: 'try', + forbidden_except: 'except', + forbidden_finally: 'finally', + forbidden_raise: 'raise', + forbidden_class: 'class', + + // Multi-character operators + doublestar: '**', + doubleslash: '//', + doubleequal: '==', + notequal: '!=', + lessequal: '<=', + greaterequal: '>=', + doublecolon: '::', + ellipsis: '...', + + // Single character operators and delimiters + lparen: '(', + rparen: ')', + lsqb: '[', + rsqb: ']', + lbrace: '{', + rbrace: '}', + colon: ':', + comma: ',', + plus: '+', + minus: '-', + star: '*', + slash: '/', + percent: '%', + less: '<', + greater: '>', + equal: '=', + dot: '.', + semi: ';', + + // Identifiers (must come after keywords) + identifier: /[a-zA-Z_][a-zA-Z0-9_]*/, +}); + +// Custom token type for indentation handling +export interface IndentToken { + type: 'INDENT' | 'DEDENT'; + value: string; + line: number; + col: number; +} + +/** + * Wrapper around Moo lexer that handles Python indentation + * This processes the token stream to inject INDENT/DEDENT tokens + */ +export class PythonLexer { + private lexer: moo.Lexer; + private tokenQueue: (moo.Token | IndentToken)[] = []; + private indentStack: number[] = [0]; + private atLineStart: boolean = true; + private pendingIndentation: number | null = null; + private currentLineIndent: number = 0; + private seenFirstToken: boolean = false; + + constructor() { + this.lexer = lexer.reset(); + } + + reset(chunk?: string, state?: any) { + // Add trailing newline if missing + if (chunk && !chunk.endsWith('\n')) { + chunk = chunk + '\n'; + } + this.lexer.reset(chunk, state); + this.tokenQueue = []; + this.indentStack = [0]; + this.atLineStart = true; + this.pendingIndentation = null; + this.currentLineIndent = 0; + this.seenFirstToken = false; + return this; + } + + next(): moo.Token | IndentToken | undefined { + // Return queued tokens first + if (this.tokenQueue.length > 0) { + return this.tokenQueue.shift(); + } + + const token = this.lexer.next(); + + if (!token) { + // End of input - emit remaining DEDENTs, then EOF, then undefined + if (this.indentStack.length > 1) { + this.indentStack.pop(); + return { + type: 'DEDENT', + value: '', + line: 0, + col: 0 + }; + } + // Return EOF token once, then undefined + if (this.indentStack.length === 1) { + this.indentStack.pop(); // Mark that we've returned EOF + return { + type: 'EOF', + value: '', + line: 0, + col: 0 + } as any; + } + // After EOF, return undefined + return undefined; + } + + // Skip comments but NOT whitespace initially (we need to measure it) + if (token.type === 'comment') { + return this.next(); + } + + // Handle newlines and indentation + if (token.type === 'newline') { + this.atLineStart = true; + this.currentLineIndent = 0; + return token; + } + + // At the start of a line, measure 
indentation + if (this.atLineStart) { + // Consume whitespace at line start to measure indentation + if (token.type === 'ws') { + this.currentLineIndent = token.value.length; + return this.next(); // Skip the whitespace, continue to next token + } + + // Now we have a real token (not whitespace), handle indentation + this.atLineStart = false; + const previousIndent = this.indentStack[this.indentStack.length - 1]; + + if (this.currentLineIndent > previousIndent) { + this.indentStack.push(this.currentLineIndent); + this.tokenQueue.push(token); + return { + type: 'INDENT', + value: '', + line: token.line!, + col: token.col! + }; + } else if (this.currentLineIndent < previousIndent) { + // Emit DEDENTs + while (this.indentStack.length > 1 && this.indentStack[this.indentStack.length - 1] > this.currentLineIndent) { + this.indentStack.pop(); + this.tokenQueue.push({ + type: 'DEDENT', + value: '', + line: token.line!, + col: token.col! + }); + } + this.tokenQueue.push(token); + return this.tokenQueue.shift(); + } + // Reset for next line + this.currentLineIndent = 0; + } + + // Skip whitespace outside of line starts (it's just spacing) + if (token.type === 'ws') { + return this.next(); + } + + // Check for forbidden keywords + if (token.type && token.type.startsWith('forbidden_')) { + throw new Error(`Forbidden keyword: ${token.value} at line ${token.line}`); + } + + return token; + } + + save() { + return { + lexerState: this.lexer.save(), + indentStack: [...this.indentStack], + atLineStart: this.atLineStart + }; + } + + formatError(token: moo.Token, message?: string) { + return this.lexer.formatError(token, message); + } + + has(tokenType: string) { + // Special tokens produced by the wrapper + if (tokenType === 'INDENT' || tokenType === 'DEDENT' || tokenType === 'EOF') { + return true; + } + return this.lexer.has(tokenType); + } +} + +// Export a factory function for Nearley +export const pythonLexer = new PythonLexer(); \ No newline at end of file diff --git a/src/nearley/parser-adapter.ts b/src/nearley/parser-adapter.ts new file mode 100644 index 0000000..c033af4 --- /dev/null +++ b/src/nearley/parser-adapter.ts @@ -0,0 +1,109 @@ +/** + * Adapter for Nearley parser to match the interface of the old hand-written parser + */ + +import nearley from 'nearley'; +import { StmtNS } from '../ast-types'; + +// Import the compiled grammar +// This will be generated by nearleyc +import grammar from './python-grammar'; + +/** + * NearleyParser - Drop-in replacement for the old Parser class + */ +export class NearleyParser { + private readonly source: string; + + constructor(source: string, tokens?: any[]) { + // Note: Nearley doesn't use pre-tokenized input in the same way + // The lexer is integrated into the parser + this.source = source; + } + + /** + * Parse the source code and return the AST + */ + parse(): StmtNS.Stmt { + // Create a new parser instance with our grammar + const parser = new nearley.Parser(nearley.Grammar.fromCompiled(grammar)); + + try { + // Feed the source code to the parser + parser.feed(this.source); + + // Check if we got results + if (parser.results.length === 0) { + throw new Error('Unexpected end of input - no parse results'); + } + + // Check for ambiguous grammar (multiple parse trees) + if (parser.results.length > 1) { + console.warn(`Ambiguous grammar: ${parser.results.length} possible parses`); + } + + // Return the first (or only) parse result + return parser.results[0]; + + } catch (error: any) { + // Transform Nearley errors to match our error format + if 
(error.token) { + const token = error.token; + const line = token.line || 0; + const col = token.col || 0; + throw new ParseError( + `Unexpected token: ${token.value || token.type} at line ${line}, column ${col}`, + line, + col, + this.source + ); + } + throw error; + } + } +} + +/** + * Error class for parse errors + */ +export class ParseError extends SyntaxError { + line: number; + col: number; + source: string; + + constructor(message: string, line: number, col: number, source: string) { + super(message); + this.name = 'ParseError'; + this.line = line; + this.col = col; + this.source = source; + } +} + +/** + * Convenience function to parse Python source code + */ +export function parse(source: string): StmtNS.Stmt { + const parser = new NearleyParser(source); + return parser.parse(); +} + +/** + * Function to test the parser with sample code + */ +export function testParser(code: string): void { + console.log('Parsing code:'); + console.log(code); + console.log('\n--- AST ---'); + try { + const ast = parse(code); + console.log(JSON.stringify(ast, null, 2)); + console.log('\nParse successful!'); + } catch (error: any) { + console.error('Parse error:', error.message); + if (error.token) { + console.error('Token:', error.token); + } + } +} + diff --git a/src/nearley/python-grammar.ts b/src/nearley/python-grammar.ts new file mode 100644 index 0000000..a5ffe7e --- /dev/null +++ b/src/nearley/python-grammar.ts @@ -0,0 +1,514 @@ +// Generated automatically by nearley, version 2.20.1 +// http://github.com/Hardmath123/nearley +function id(x) { return x[0]; } + +import { pythonLexer } from './lexer'; +import { ExprNS, StmtNS } from '../ast-types'; +import { Token as AstToken } from '../tokenizer/tokenizer'; +import { TokenType } from '../tokens'; + +const tokenTypeMap: { [key: string]: TokenType } = { + 'identifier': TokenType.NAME, + 'integer': TokenType.NUMBER, + 'float': TokenType.NUMBER, + 'bigint': TokenType.BIGINT, + 'complex': TokenType.COMPLEX, + 'stringTripleDouble': TokenType.STRING, + 'stringTripleSingle': TokenType.STRING, + 'stringDouble': TokenType.STRING, + 'stringSingle': TokenType.STRING, + + 'kw_def': TokenType.DEF, + 'kw_if': TokenType.IF, + 'kw_elif': TokenType.ELIF, + 'kw_else': TokenType.ELSE, + 'kw_while': TokenType.WHILE, + 'kw_for': TokenType.FOR, + 'kw_in': TokenType.IN, + 'kw_return': TokenType.RETURN, + 'kw_pass': TokenType.PASS, + 'kw_break': TokenType.BREAK, + 'kw_continue': TokenType.CONTINUE, + 'kw_and': TokenType.AND, + 'kw_or': TokenType.OR, + 'kw_not': TokenType.NOT, + 'kw_is': TokenType.IS, + 'kw_lambda': TokenType.LAMBDA, + 'kw_from': TokenType.FROM, + 'kw_import': TokenType.IMPORT, + 'kw_global': TokenType.GLOBAL, + 'kw_nonlocal': TokenType.NONLOCAL, + 'kw_assert': TokenType.ASSERT, + 'kw_True': TokenType.TRUE, + 'kw_False': TokenType.FALSE, + 'kw_None': TokenType.NONE, + + 'doublestar': TokenType.DOUBLESTAR, + 'doubleslash': TokenType.DOUBLESLASH, + 'doubleequal': TokenType.DOUBLEEQUAL, + 'notequal': TokenType.NOTEQUAL, + 'lessequal': TokenType.LESSEQUAL, + 'greaterequal': TokenType.GREATEREQUAL, + 'doublecolon': TokenType.DOUBLECOLON, + 'ellipsis': TokenType.ELLIPSIS, + + 'lparen': TokenType.LPAR, + 'rparen': TokenType.RPAR, + 'lsqb': TokenType.LSQB, + 'rsqb': TokenType.RSQB, + 'colon': TokenType.COLON, + 'comma': TokenType.COMMA, + 'plus': TokenType.PLUS, + 'minus': TokenType.MINUS, + 'star': TokenType.STAR, + 'slash': TokenType.SLASH, + 'percent': TokenType.PERCENT, + 'less': TokenType.LESS, + 'greater': TokenType.GREATER, + 'equal': TokenType.EQUAL, + 
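+  // Note: any moo token type missing from this map silently falls back to
+  // TokenType.NAME in toAstToken() below. INDENT/DEDENT/EOF are emitted by
+  // the PythonLexer indentation wrapper rather than by moo itself, and
+  // NOTIN/ISNOT are presumably synthesized from `not in` / `is not` pairs
+  // during grammar postprocessing (there is no single moo token for either).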
'dot': TokenType.DOT, + 'semi': TokenType.SEMI, + 'lbrace': TokenType.LBRACE, + 'rbrace': TokenType.RBRACE, + + 'INDENT': TokenType.INDENT, + 'DEDENT': TokenType.DEDENT, + 'newline': TokenType.NEWLINE, + 'EOF': TokenType.ENDMARKER, + 'NOTIN': TokenType.NOTIN, + 'ISNOT': TokenType.ISNOT, +}; + + +// Helper to convert moo tokens to AST tokens +function toAstToken(token: any): AstToken { + const type = tokenTypeMap[token.type] || TokenType.NAME; + return new AstToken( + type, + token.value, + token.line || 0, + token.col || 0, + token.offset || 0 + ); +} + +// Helper to get token type string +function tokenType(d: any, index: number): string { + return d[index]?.type || ''; +} +let Lexer = pythonLexer; +let ParserRules = [ + {"name": "file_input", "symbols": ["_", "statements", "_", (pythonLexer.has("EOF") ? {type: "EOF"} : EOF)], "postprocess": + (d) => { + const startToken = d[1][0]?.startToken || toAstToken({ type: 'ENDMARKER', value: '', line: 0, col: 0, offset: 0 }); + const endToken = d[1][d[1].length - 1]?.endToken || startToken; + return new StmtNS.FileInput(startToken, endToken, d[1], []); + } + }, + {"name": "statements", "symbols": [], "postprocess": () => []}, + {"name": "statements", "symbols": ["statements", "statement"], "postprocess": (d) => d[0].concat([d[1]])}, + {"name": "statements", "symbols": ["statements", (pythonLexer.has("newline") ? {type: "newline"} : newline)], "postprocess": (d) => d[0]}, + {"name": "statement", "symbols": ["simple_stmt"], "postprocess": id}, + {"name": "statement", "symbols": ["compound_stmt"], "postprocess": id}, + {"name": "simple_stmt", "symbols": ["small_stmt", (pythonLexer.has("newline") ? {type: "newline"} : newline)], "postprocess": (d) => d[0]}, + {"name": "small_stmt", "symbols": ["assign_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["pass_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["flow_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["import_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["global_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["nonlocal_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["assert_stmt"], "postprocess": id}, + {"name": "small_stmt", "symbols": ["expr_stmt"], "postprocess": id}, + {"name": "assign_stmt", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier), "_", {"literal":":"}, "_", "test", "_", {"literal":"="}, "_", "test"], "postprocess": + (d) => { + const name = toAstToken(d[0]); + const value = d[8]; + const ann = d[4]; + return new StmtNS.AnnAssign(name, value.endToken, name, value, ann); + } + }, + {"name": "assign_stmt", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier), "_", {"literal":":"}, "_", "test"], "postprocess": + (d) => { + const name = toAstToken(d[0]); + const ann = d[4]; + const value = new ExprNS.None(name, name, "None"); + return new StmtNS.AnnAssign(name, ann.endToken, name, value, ann); + } + }, + {"name": "assign_stmt", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier), "_", {"literal":"="}, "_", "test"], "postprocess": + (d) => { + const name = toAstToken(d[0]); + const value = d[4]; + return new StmtNS.Assign(name, value.endToken, name, value); + } + }, + {"name": "pass_stmt", "symbols": [(pythonLexer.has("kw_pass") ? 
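+  /*
+   * Each `pythonLexer.has("t") ? {type: "t"} : t` guard is boilerplate that
+   * nearleyc emits for a `%t` token reference: when the lexer knows the type,
+   * the rule matches on `{type: "t"}`; otherwise it falls through to a bare
+   * identifier. A hand-written rule of the same shape, for illustration only:
+   *
+   *   const passRule = {
+   *     name: "pass_stmt",
+   *     symbols: [{ type: "kw_pass" }],   // match a single `pass` keyword token
+   *     postprocess: (d: any[]) => new StmtNS.Pass(toAstToken(d[0]), toAstToken(d[0])),
+   *   };
+   */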
{type: "kw_pass"} : kw_pass)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Pass(token, token); + } + }, + {"name": "flow_stmt", "symbols": ["break_stmt"], "postprocess": id}, + {"name": "flow_stmt", "symbols": ["continue_stmt"], "postprocess": id}, + {"name": "flow_stmt", "symbols": ["return_stmt"], "postprocess": id}, + {"name": "break_stmt", "symbols": [(pythonLexer.has("kw_break") ? {type: "kw_break"} : kw_break)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Break(token, token); + } + }, + {"name": "continue_stmt", "symbols": [(pythonLexer.has("kw_continue") ? {type: "kw_continue"} : kw_continue)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Continue(token, token); + } + }, + {"name": "return_stmt", "symbols": [(pythonLexer.has("kw_return") ? {type: "kw_return"} : kw_return), "_", "test"], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Return(token, d[2].endToken, d[2]); + } + }, + {"name": "return_stmt", "symbols": [(pythonLexer.has("kw_return") ? {type: "kw_return"} : kw_return)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Return(token, token, null); + } + }, + {"name": "import_stmt", "symbols": [(pythonLexer.has("kw_from") ? {type: "kw_from"} : kw_from), "_", (pythonLexer.has("identifier") ? {type: "identifier"} : identifier), "_", (pythonLexer.has("kw_import") ? {type: "kw_import"} : kw_import), "_", "import_names"], "postprocess": + (d) => { + const fromToken = toAstToken(d[0]); + const module = toAstToken(d[2]); + const names = d[6]; + return new StmtNS.FromImport(fromToken, names[names.length - 1], module, names); + } + }, + {"name": "import_names", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": (d) => [toAstToken(d[0])]}, + {"name": "import_names", "symbols": [{"literal":"("}, "_", "name_list", "_", {"literal":")"}], "postprocess": (d) => d[2]}, + {"name": "name_list", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": (d) => [toAstToken(d[0])]}, + {"name": "name_list", "symbols": ["name_list", "_", {"literal":","}, "_", (pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": (d) => d[0].concat([toAstToken(d[4])])}, + {"name": "global_stmt", "symbols": [(pythonLexer.has("kw_global") ? {type: "kw_global"} : kw_global), "_", (pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + const name = toAstToken(d[2]); + return new StmtNS.Global(token, name, name); + } + }, + {"name": "nonlocal_stmt", "symbols": [(pythonLexer.has("kw_nonlocal") ? {type: "kw_nonlocal"} : kw_nonlocal), "_", (pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + const name = toAstToken(d[2]); + return new StmtNS.NonLocal(token, name, name); + } + }, + {"name": "assert_stmt", "symbols": [(pythonLexer.has("kw_assert") ? 
{type: "kw_assert"} : kw_assert), "_", "test"], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Assert(token, d[2].endToken, d[2]); + } + }, + {"name": "expr_stmt", "symbols": ["test"], "postprocess": + (d) => { + const expr = d[0]; + return new StmtNS.SimpleExpr(expr.startToken, expr.endToken, expr); + } + }, + {"name": "compound_stmt", "symbols": ["if_stmt"], "postprocess": id}, + {"name": "compound_stmt", "symbols": ["while_stmt"], "postprocess": id}, + {"name": "compound_stmt", "symbols": ["for_stmt"], "postprocess": id}, + {"name": "compound_stmt", "symbols": ["funcdef"], "postprocess": id}, + {"name": "if_stmt", "symbols": [(pythonLexer.has("kw_if") ? {type: "kw_if"} : kw_if), "_", "test", "_", {"literal":":"}, "_", "suite", "elif_chain"], "postprocess": + (d) => { + const ifToken = toAstToken(d[0]); + const condition = d[2]; + const body = d[6]; + const elseBlock = d[7]; + const endToken = elseBlock ? elseBlock[elseBlock.length - 1]?.endToken : body[body.length - 1]?.endToken; + return new StmtNS.If(ifToken, endToken, condition, body, elseBlock); + } + }, + {"name": "elif_chain", "symbols": [(pythonLexer.has("kw_elif") ? {type: "kw_elif"} : kw_elif), "_", "test", "_", {"literal":":"}, "_", "suite", "elif_chain"], "postprocess": + (d) => { + const elifToken = toAstToken(d[0]); + const condition = d[2]; + const body = d[6]; + const elseBlock = d[7]; + const endToken = elseBlock ? elseBlock[elseBlock.length - 1]?.endToken : body[body.length - 1]?.endToken; + return [new StmtNS.If(elifToken, endToken, condition, body, elseBlock)]; + } + }, + {"name": "elif_chain", "symbols": [(pythonLexer.has("kw_else") ? {type: "kw_else"} : kw_else), "_", {"literal":":"}, "_", "suite"], "postprocess": + (d) => d[4] + }, + {"name": "elif_chain", "symbols": [], "postprocess": () => null}, + {"name": "while_stmt", "symbols": [(pythonLexer.has("kw_while") ? {type: "kw_while"} : kw_while), "_", "test", "_", {"literal":":"}, "_", "suite"], "postprocess": + (d) => { + const whileToken = toAstToken(d[0]); + const condition = d[2]; + const body = d[6]; + const endToken = body[body.length - 1]?.endToken || whileToken; + return new StmtNS.While(whileToken, endToken, condition, body); + } + }, + {"name": "for_stmt", "symbols": [(pythonLexer.has("kw_for") ? {type: "kw_for"} : kw_for), "_", (pythonLexer.has("identifier") ? {type: "identifier"} : identifier), "_", (pythonLexer.has("kw_in") ? {type: "kw_in"} : kw_in), "_", "test", "_", {"literal":":"}, "_", "suite"], "postprocess": + (d) => { + const forToken = toAstToken(d[0]); + const target = toAstToken(d[2]); + const iter = d[6]; + const body = d[10]; + const endToken = body[body.length - 1]?.endToken || forToken; + return new StmtNS.For(forToken, endToken, target, iter, body); + } + }, + {"name": "funcdef", "symbols": [(pythonLexer.has("kw_def") ? {type: "kw_def"} : kw_def), "_", (pythonLexer.has("identifier") ? 
{type: "identifier"} : identifier), "_", "parameters", "_", {"literal":":"}, "_", "suite"], "postprocess": + (d) => { + const defToken = toAstToken(d[0]); + const name = toAstToken(d[2]); + const params = d[4]; + const body = d[8]; + const endToken = body[body.length - 1]?.endToken || name; + return new StmtNS.FunctionDef(defToken, endToken, name, params, body, []); + } + }, + {"name": "parameters", "symbols": [{"literal":"("}, "_", {"literal":")"}], "postprocess": () => []}, + {"name": "parameters", "symbols": [{"literal":"("}, "_", "varargslist", "_", {"literal":")"}], "postprocess": (d) => d[2]}, + {"name": "varargslist", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": (d) => [toAstToken(d[0])]}, + {"name": "varargslist", "symbols": ["varargslist", "_", {"literal":","}, "_", (pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": (d) => d[0].concat([toAstToken(d[4])])}, + {"name": "suite", "symbols": ["simple_stmt"], "postprocess": (d) => [d[0]]}, + {"name": "suite", "symbols": [(pythonLexer.has("newline") ? {type: "newline"} : newline), (pythonLexer.has("INDENT") ? {type: "INDENT"} : INDENT), "suite_stmts", (pythonLexer.has("DEDENT") ? {type: "DEDENT"} : DEDENT)], "postprocess": (d) => d[2]}, + {"name": "suite_stmts", "symbols": ["statement"], "postprocess": (d) => [d[0]]}, + {"name": "suite_stmts", "symbols": ["suite_stmts", "statement"], "postprocess": (d) => d[0].concat([d[1]])}, + {"name": "suite_stmts", "symbols": ["suite_stmts", (pythonLexer.has("newline") ? {type: "newline"} : newline)], "postprocess": (d) => d[0]}, + {"name": "test", "symbols": ["or_test", "_", (pythonLexer.has("kw_if") ? {type: "kw_if"} : kw_if), "_", "or_test", "_", (pythonLexer.has("kw_else") ? {type: "kw_else"} : kw_else), "_", "test"], "postprocess": + (d) => { + const consequent = d[0]; + const predicate = d[4]; + const alternative = d[8]; + return new ExprNS.Ternary(consequent.startToken, alternative.endToken, predicate, consequent, alternative); + } + }, + {"name": "test", "symbols": ["or_test"], "postprocess": id}, + {"name": "test", "symbols": ["lambdef"], "postprocess": id}, + {"name": "lambdef", "symbols": [(pythonLexer.has("kw_lambda") ? {type: "kw_lambda"} : kw_lambda), "_", "varargslist", "_", {"literal":":"}, "_", "test"], "postprocess": + (d) => { + const lambdaToken = toAstToken(d[0]); + const params = d[2]; + const body = d[6]; + return new ExprNS.Lambda(lambdaToken, body.endToken, params, body); + } + }, + {"name": "lambdef", "symbols": [(pythonLexer.has("kw_lambda") ? {type: "kw_lambda"} : kw_lambda), "_", "varargslist", "_", {"literal":"::"}, "_", "suite"], "postprocess": + (d) => { + const lambdaToken = toAstToken(d[0]); + const params = d[2]; + const body = d[6]; + const endToken = body[body.length - 1]?.endToken || lambdaToken; + return new ExprNS.MultiLambda(lambdaToken, endToken, params, body, []); + } + }, + {"name": "lambdef", "symbols": [(pythonLexer.has("kw_lambda") ? {type: "kw_lambda"} : kw_lambda), "_", {"literal":":"}, "_", "test"], "postprocess": + (d) => { + const lambdaToken = toAstToken(d[0]); + const body = d[4]; + return new ExprNS.Lambda(lambdaToken, body.endToken, [], body); + } + }, + {"name": "lambdef", "symbols": [(pythonLexer.has("kw_lambda") ? 
{type: "kw_lambda"} : kw_lambda), "_", {"literal":"::"}, "_", "suite"], "postprocess": + (d) => { + const lambdaToken = toAstToken(d[0]); + const body = d[4]; + const endToken = body[body.length - 1]?.endToken || lambdaToken; + return new ExprNS.MultiLambda(lambdaToken, endToken, [], body, []); + } + }, + {"name": "or_test", "symbols": ["and_test", "_", (pythonLexer.has("kw_or") ? {type: "kw_or"} : kw_or), "_", "or_test"], "postprocess": + (d) => { + const left = d[0]; + const operator = toAstToken(d[2]); + const right = d[4]; + return new ExprNS.BoolOp(left.startToken, right.endToken, left, operator, right); + } + }, + {"name": "or_test", "symbols": ["and_test"], "postprocess": id}, + {"name": "and_test", "symbols": ["not_test", "_", (pythonLexer.has("kw_and") ? {type: "kw_and"} : kw_and), "_", "and_test"], "postprocess": + (d) => { + const left = d[0]; + const operator = toAstToken(d[2]); + const right = d[4]; + return new ExprNS.BoolOp(left.startToken, right.endToken, left, operator, right); + } + }, + {"name": "and_test", "symbols": ["not_test"], "postprocess": id}, + {"name": "not_test", "symbols": [(pythonLexer.has("kw_not") ? {type: "kw_not"} : kw_not), "_", "not_test"], "postprocess": + (d) => { + const operator = toAstToken(d[0]); + const right = d[2]; + return new ExprNS.Unary(operator, right.endToken, operator, right); + } + }, + {"name": "not_test", "symbols": ["comparison"], "postprocess": id}, + {"name": "comparison", "symbols": ["arith_expr", "_", "comp_op", "_", "comparison"], "postprocess": + (d) => { + const left = d[0]; + const operator = d[2]; + const right = d[4]; + return new ExprNS.Compare(left.startToken, right.endToken, left, operator, right); + } + }, + {"name": "comparison", "symbols": ["arith_expr"], "postprocess": id}, + {"name": "comp_op", "symbols": [(pythonLexer.has("less") ? {type: "less"} : less)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("greater") ? {type: "greater"} : greater)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("doubleequal") ? {type: "doubleequal"} : doubleequal)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("greaterequal") ? {type: "greaterequal"} : greaterequal)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("lessequal") ? {type: "lessequal"} : lessequal)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("notequal") ? {type: "notequal"} : notequal)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("kw_in") ? {type: "kw_in"} : kw_in)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "comp_op", "symbols": [(pythonLexer.has("kw_not") ? {type: "kw_not"} : kw_not), "_", (pythonLexer.has("kw_in") ? {type: "kw_in"} : kw_in)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + token.type = 'NOTIN'; + return token; + } + }, + {"name": "comp_op", "symbols": [(pythonLexer.has("kw_is") ? {type: "kw_is"} : kw_is)], "postprocess": + (d) => toAstToken(d[0]) + }, + {"name": "comp_op", "symbols": [(pythonLexer.has("kw_is") ? {type: "kw_is"} : kw_is), "_", (pythonLexer.has("kw_not") ? 
{type: "kw_not"} : kw_not)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + token.type = 'ISNOT'; + return token; + } + }, + {"name": "arith_expr", "symbols": ["term", "_", "arith_op", "_", "arith_expr"], "postprocess": + (d) => { + const left = d[0]; + const operator = d[2]; + const right = d[4]; + return new ExprNS.Binary(left.startToken, right.endToken, left, operator, right); + } + }, + {"name": "arith_expr", "symbols": ["term"], "postprocess": id}, + {"name": "arith_op", "symbols": [(pythonLexer.has("plus") ? {type: "plus"} : plus)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "arith_op", "symbols": [(pythonLexer.has("minus") ? {type: "minus"} : minus)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "term", "symbols": ["factor", "_", "term_op", "_", "term"], "postprocess": + (d) => { + const left = d[0]; + const operator = d[2]; + const right = d[4]; + return new ExprNS.Binary(left.startToken, right.endToken, left, operator, right); + } + }, + {"name": "term", "symbols": ["factor"], "postprocess": id}, + {"name": "term_op", "symbols": [(pythonLexer.has("star") ? {type: "star"} : star)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "term_op", "symbols": [(pythonLexer.has("slash") ? {type: "slash"} : slash)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "term_op", "symbols": [(pythonLexer.has("percent") ? {type: "percent"} : percent)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "term_op", "symbols": [(pythonLexer.has("doubleslash") ? {type: "doubleslash"} : doubleslash)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "factor", "symbols": ["unary_op", "_", "factor"], "postprocess": + (d) => { + const operator = d[0]; + const right = d[2]; + return new ExprNS.Unary(operator, right.endToken, operator, right); + } + }, + {"name": "factor", "symbols": ["power"], "postprocess": id}, + {"name": "unary_op", "symbols": [(pythonLexer.has("plus") ? {type: "plus"} : plus)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "unary_op", "symbols": [(pythonLexer.has("minus") ? {type: "minus"} : minus)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "power", "symbols": ["atom_expr", "_", (pythonLexer.has("doublestar") ? {type: "doublestar"} : doublestar), "_", "factor"], "postprocess": + (d) => { + const left = d[0]; + const operator = toAstToken(d[2]); + const right = d[4]; + return new ExprNS.Binary(left.startToken, right.endToken, left, operator, right); + } + }, + {"name": "power", "symbols": ["atom_expr"], "postprocess": id}, + {"name": "atom_expr", "symbols": ["atom", {"literal":"("}, "_", "test_list", "_", {"literal":")"}], "postprocess": + (d) => { + const callee = d[0]; + const args = d[3]; + const endToken = args.length > 0 ? 
args[args.length - 1].endToken : callee.endToken; + return new ExprNS.Call(callee.startToken, endToken, callee, args); + } + }, + {"name": "atom_expr", "symbols": ["atom", {"literal":"("}, "_", {"literal":")"}], "postprocess": + (d) => { + const callee = d[0]; + return new ExprNS.Call(callee.startToken, callee.endToken, callee, []); + } + }, + {"name": "atom_expr", "symbols": ["atom"], "postprocess": id}, + {"name": "test_list", "symbols": ["test"], "postprocess": (d) => [d[0]]}, + {"name": "test_list", "symbols": ["test_list", "_", {"literal":","}, "_", "test"], "postprocess": (d) => d[0].concat([d[4]])}, + {"name": "test_list", "symbols": ["test", "_", {"literal":","}], "postprocess": (d) => [d[0]]}, + {"name": "atom", "symbols": [{"literal":"("}, "_", "test", "_", {"literal":")"}], "postprocess": + (d) => { + const lparen = toAstToken({ type: 'LPAREN', value: '(', line: 0, col: 0, offset: 0 }); + const rparen = toAstToken({ type: 'RPAREN', value: ')', line: 0, col: 0, offset: 0 }); + return new ExprNS.Grouping(lparen, rparen, d[2]); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("identifier") ? {type: "identifier"} : identifier)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Variable(token, token, token); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("integer") ? {type: "integer"} : integer)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, parseInt(token.lexeme)); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("float") ? {type: "float"} : float)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, parseFloat(token.lexeme)); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("bigint") ? {type: "bigint"} : bigint)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.BigIntLiteral(token, token, token.lexeme.slice(0, -1)); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("complex") ? {type: "complex"} : complex)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Complex(token, token, token.lexeme); + } + }, + {"name": "atom", "symbols": ["string_literal"], "postprocess": + (d) => { + const token = d[0]; + return new ExprNS.Literal(token, token, token.lexeme); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("kw_None") ? {type: "kw_None"} : kw_None)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.None(token, token, "None"); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("kw_True") ? {type: "kw_True"} : kw_True)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, true); + } + }, + {"name": "atom", "symbols": [(pythonLexer.has("kw_False") ? {type: "kw_False"} : kw_False)], "postprocess": + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, false); + } + }, + {"name": "string_literal", "symbols": [(pythonLexer.has("stringTripleDouble") ? {type: "stringTripleDouble"} : stringTripleDouble)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "string_literal", "symbols": [(pythonLexer.has("stringTripleSingle") ? {type: "stringTripleSingle"} : stringTripleSingle)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "string_literal", "symbols": [(pythonLexer.has("stringDouble") ? 
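+  /*
+   * Literal atoms convert lexeme text straight to JS values: parseInt and
+   * parseFloat for numbers, the raw lexeme for strings, and for bigints the
+   * trailing n/N suffix is stripped before the digits are stored:
+   *
+   *   '123n'.slice(0, -1)   // -> '123'
+   *
+   * (turning that digit string back into a BigInt is assumed to happen in a
+   * later phase; it is not shown in this file).
+   */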
{type: "stringDouble"} : stringDouble)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "string_literal", "symbols": [(pythonLexer.has("stringSingle") ? {type: "stringSingle"} : stringSingle)], "postprocess": (d) => toAstToken(d[0])}, + {"name": "_", "symbols": []}, + {"name": "_", "symbols": [(pythonLexer.has("ws") ? {type: "ws"} : ws)]}, + {"name": "__", "symbols": [(pythonLexer.has("ws") ? {type: "ws"} : ws)]} +]; +let ParserStart = "file_input"; +export default { Lexer, ParserRules, ParserStart }; diff --git a/src/nearley/python.ne b/src/nearley/python.ne new file mode 100644 index 0000000..990e608 --- /dev/null +++ b/src/nearley/python.ne @@ -0,0 +1,612 @@ +# Nearley grammar for Python subset (Source Academy) +# Based on Grammar.gram but converted to Nearley syntax + +@preprocessor esmodule + +@{% +import { pythonLexer } from './lexer'; +import { ExprNS, StmtNS } from '../ast-types'; +import { Token as AstToken } from '../tokenizer/tokenizer'; +import { TokenType } from '../tokens'; + +const tokenTypeMap: { [key: string]: TokenType } = { + 'identifier': TokenType.NAME, + 'integer': TokenType.NUMBER, + 'float': TokenType.NUMBER, + 'bigint': TokenType.BIGINT, + 'complex': TokenType.COMPLEX, + 'stringTripleDouble': TokenType.STRING, + 'stringTripleSingle': TokenType.STRING, + 'stringDouble': TokenType.STRING, + 'stringSingle': TokenType.STRING, + + 'kw_def': TokenType.DEF, + 'kw_if': TokenType.IF, + 'kw_elif': TokenType.ELIF, + 'kw_else': TokenType.ELSE, + 'kw_while': TokenType.WHILE, + 'kw_for': TokenType.FOR, + 'kw_in': TokenType.IN, + 'kw_return': TokenType.RETURN, + 'kw_pass': TokenType.PASS, + 'kw_break': TokenType.BREAK, + 'kw_continue': TokenType.CONTINUE, + 'kw_and': TokenType.AND, + 'kw_or': TokenType.OR, + 'kw_not': TokenType.NOT, + 'kw_is': TokenType.IS, + 'kw_lambda': TokenType.LAMBDA, + 'kw_from': TokenType.FROM, + 'kw_import': TokenType.IMPORT, + 'kw_global': TokenType.GLOBAL, + 'kw_nonlocal': TokenType.NONLOCAL, + 'kw_assert': TokenType.ASSERT, + 'kw_True': TokenType.TRUE, + 'kw_False': TokenType.FALSE, + 'kw_None': TokenType.NONE, + + 'doublestar': TokenType.DOUBLESTAR, + 'doubleslash': TokenType.DOUBLESLASH, + 'doubleequal': TokenType.DOUBLEEQUAL, + 'notequal': TokenType.NOTEQUAL, + 'lessequal': TokenType.LESSEQUAL, + 'greaterequal': TokenType.GREATEREQUAL, + 'doublecolon': TokenType.DOUBLECOLON, + 'ellipsis': TokenType.ELLIPSIS, + + 'lparen': TokenType.LPAR, + 'rparen': TokenType.RPAR, + 'lsqb': TokenType.LSQB, + 'rsqb': TokenType.RSQB, + 'colon': TokenType.COLON, + 'comma': TokenType.COMMA, + 'plus': TokenType.PLUS, + 'minus': TokenType.MINUS, + 'star': TokenType.STAR, + 'slash': TokenType.SLASH, + 'percent': TokenType.PERCENT, + 'less': TokenType.LESS, + 'greater': TokenType.GREATER, + 'equal': TokenType.EQUAL, + 'dot': TokenType.DOT, + 'semi': TokenType.SEMI, + 'lbrace': TokenType.LBRACE, + 'rbrace': TokenType.RBRACE, + + 'INDENT': TokenType.INDENT, + 'DEDENT': TokenType.DEDENT, + 'newline': TokenType.NEWLINE, + 'EOF': TokenType.ENDMARKER, + 'NOTIN': TokenType.NOTIN, + 'ISNOT': TokenType.ISNOT, +}; + + +// Helper to convert moo tokens to AST tokens +function toAstToken(token: any): AstToken { + const type = tokenTypeMap[token.type] || TokenType.NAME; + return new AstToken( + type, + token.value, + token.line || 0, + token.col || 0, + token.offset || 0 + ); +} + +// Helper to get token type string +function tokenType(d: any, index: number): string { + return d[index]?.type || ''; +} +%} + +@lexer pythonLexer + +# Start symbol +file_input -> + _ statements _ %EOF 
{% + (d) => { + const startToken = d[1][0]?.startToken || toAstToken({ type: 'ENDMARKER', value: '', line: 0, col: 0, offset: 0 }); + const endToken = d[1][d[1].length - 1]?.endToken || startToken; + return new StmtNS.FileInput(startToken, endToken, d[1], []); + } + %} + +statements -> + null {% () => [] %} + | statements statement {% (d) => d[0].concat([d[1]]) %} + | statements %newline {% (d) => d[0] %} + +statement -> + simple_stmt {% id %} + | compound_stmt {% id %} + +# Simple statements +simple_stmt -> + small_stmt %newline {% (d) => d[0] %} + +small_stmt -> + assign_stmt {% id %} + | pass_stmt {% id %} + | flow_stmt {% id %} + | import_stmt {% id %} + | global_stmt {% id %} + | nonlocal_stmt {% id %} + | assert_stmt {% id %} + | expr_stmt {% id %} + +assign_stmt -> + %identifier _ ":" _ test _ "=" _ test {% + (d) => { + const name = toAstToken(d[0]); + const value = d[8]; + const ann = d[4]; + return new StmtNS.AnnAssign(name, value.endToken, name, value, ann); + } + %} + | %identifier _ ":" _ test {% + (d) => { + const name = toAstToken(d[0]); + const ann = d[4]; + const value = new ExprNS.None(name, name, "None"); + return new StmtNS.AnnAssign(name, ann.endToken, name, value, ann); + } + %} + | %identifier _ "=" _ test {% + (d) => { + const name = toAstToken(d[0]); + const value = d[4]; + return new StmtNS.Assign(name, value.endToken, name, value); + } + %} + +pass_stmt -> + %kw_pass {% + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Pass(token, token); + } + %} + +flow_stmt -> + break_stmt {% id %} + | continue_stmt {% id %} + | return_stmt {% id %} + +break_stmt -> + %kw_break {% + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Break(token, token); + } + %} + +continue_stmt -> + %kw_continue {% + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Continue(token, token); + } + %} + +return_stmt -> + %kw_return _ test {% + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Return(token, d[2].endToken, d[2]); + } + %} + | %kw_return {% + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Return(token, token, null); + } + %} + +import_stmt -> + %kw_from _ %identifier _ %kw_import _ import_names {% + (d) => { + const fromToken = toAstToken(d[0]); + const module = toAstToken(d[2]); + const names = d[6]; + return new StmtNS.FromImport(fromToken, names[names.length - 1], module, names); + } + %} + +import_names -> + %identifier {% (d) => [toAstToken(d[0])] %} + | "(" _ name_list _ ")" {% (d) => d[2] %} + +name_list -> + %identifier {% (d) => [toAstToken(d[0])] %} + | name_list _ "," _ %identifier {% (d) => d[0].concat([toAstToken(d[4])]) %} + +global_stmt -> + %kw_global _ %identifier {% + (d) => { + const token = toAstToken(d[0]); + const name = toAstToken(d[2]); + return new StmtNS.Global(token, name, name); + } + %} + +nonlocal_stmt -> + %kw_nonlocal _ %identifier {% + (d) => { + const token = toAstToken(d[0]); + const name = toAstToken(d[2]); + return new StmtNS.NonLocal(token, name, name); + } + %} + +assert_stmt -> + %kw_assert _ test {% + (d) => { + const token = toAstToken(d[0]); + return new StmtNS.Assert(token, d[2].endToken, d[2]); + } + %} + +expr_stmt -> + test {% + (d) => { + const expr = d[0]; + return new StmtNS.SimpleExpr(expr.startToken, expr.endToken, expr); + } + %} + +# Compound statements +compound_stmt -> + if_stmt {% id %} + | while_stmt {% id %} + | for_stmt {% id %} + | funcdef {% id %} + +if_stmt -> + %kw_if _ test _ ":" _ suite elif_chain {% + (d) => { + const ifToken = toAstToken(d[0]); + 
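+      /*
+       * Nearley passes one `d` slot per symbol in the rule, so the `_`
+       * whitespace placeholders occupy real indices. For
+       * `%kw_if _ test _ ":" _ suite elif_chain`:
+       *
+       *   d[0] = `if` token   d[2] = test   d[6] = suite   d[7] = elif_chain
+       *
+       * which is why the bindings below skip the odd slots.
+       */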
const condition = d[2]; + const body = d[6]; + const elseBlock = d[7]; + const endToken = elseBlock ? elseBlock[elseBlock.length - 1]?.endToken : body[body.length - 1]?.endToken; + return new StmtNS.If(ifToken, endToken, condition, body, elseBlock); + } + %} + +elif_chain -> + %kw_elif _ test _ ":" _ suite elif_chain {% + (d) => { + const elifToken = toAstToken(d[0]); + const condition = d[2]; + const body = d[6]; + const elseBlock = d[7]; + const endToken = elseBlock ? elseBlock[elseBlock.length - 1]?.endToken : body[body.length - 1]?.endToken; + return [new StmtNS.If(elifToken, endToken, condition, body, elseBlock)]; + } + %} + | %kw_else _ ":" _ suite {% + (d) => d[4] + %} + | null {% () => null %} + +while_stmt -> + %kw_while _ test _ ":" _ suite {% + (d) => { + const whileToken = toAstToken(d[0]); + const condition = d[2]; + const body = d[6]; + const endToken = body[body.length - 1]?.endToken || whileToken; + return new StmtNS.While(whileToken, endToken, condition, body); + } + %} + +for_stmt -> + %kw_for _ %identifier _ %kw_in _ test _ ":" _ suite {% + (d) => { + const forToken = toAstToken(d[0]); + const target = toAstToken(d[2]); + const iter = d[6]; + const body = d[10]; + const endToken = body[body.length - 1]?.endToken || forToken; + return new StmtNS.For(forToken, endToken, target, iter, body); + } + %} + +funcdef -> + %kw_def _ %identifier _ parameters _ ":" _ suite {% + (d) => { + const defToken = toAstToken(d[0]); + const name = toAstToken(d[2]); + const params = d[4]; + const body = d[8]; + const endToken = body[body.length - 1]?.endToken || name; + return new StmtNS.FunctionDef(defToken, endToken, name, params, body, []); + } + %} + +parameters -> + "(" _ ")" {% () => [] %} + | "(" _ varargslist _ ")" {% (d) => d[2] %} + +varargslist -> + %identifier {% (d) => [toAstToken(d[0])] %} + | varargslist _ "," _ %identifier {% (d) => d[0].concat([toAstToken(d[4])]) %} + +suite -> + simple_stmt {% (d) => [d[0]] %} + | %newline %INDENT suite_stmts %DEDENT {% (d) => d[2] %} + +suite_stmts -> + statement {% (d) => [d[0]] %} + | suite_stmts statement {% (d) => d[0].concat([d[1]]) %} + | suite_stmts %newline {% (d) => d[0] %} + +# Expressions (following precedence) +test -> + or_test _ %kw_if _ or_test _ %kw_else _ test {% + (d) => { + const consequent = d[0]; + const predicate = d[4]; + const alternative = d[8]; + return new ExprNS.Ternary(consequent.startToken, alternative.endToken, predicate, consequent, alternative); + } + %} + | or_test {% id %} + | lambdef {% id %} + +lambdef -> + %kw_lambda _ varargslist _ ":" _ test {% + (d) => { + const lambdaToken = toAstToken(d[0]); + const params = d[2]; + const body = d[6]; + return new ExprNS.Lambda(lambdaToken, body.endToken, params, body); + } + %} + | %kw_lambda _ varargslist _ "::" _ suite {% + (d) => { + const lambdaToken = toAstToken(d[0]); + const params = d[2]; + const body = d[6]; + const endToken = body[body.length - 1]?.endToken || lambdaToken; + return new ExprNS.MultiLambda(lambdaToken, endToken, params, body, []); + } + %} + | %kw_lambda _ ":" _ test {% + (d) => { + const lambdaToken = toAstToken(d[0]); + const body = d[4]; + return new ExprNS.Lambda(lambdaToken, body.endToken, [], body); + } + %} + | %kw_lambda _ "::" _ suite {% + (d) => { + const lambdaToken = toAstToken(d[0]); + const body = d[4]; + const endToken = body[body.length - 1]?.endToken || lambdaToken; + return new ExprNS.MultiLambda(lambdaToken, endToken, [], body, []); + } + %} + +or_test -> + and_test _ %kw_or _ or_test {% + (d) => { + const left = d[0]; + 
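+      /*
+       * or_test / and_test are right-recursive, so `a or b or c` builds
+       * BoolOp(a, or, BoolOp(b, or, c)). For short-circuit evaluation this is
+       * observably equivalent to Python's left grouping: operands are still
+       * evaluated a, b, c in order, stopping at the first truthy (or) or
+       * falsy (and) value.
+       */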
const operator = toAstToken(d[2]); + const right = d[4]; + return new ExprNS.BoolOp(left.startToken, right.endToken, left, operator, right); + } + %} + | and_test {% id %} + +and_test -> + not_test _ %kw_and _ and_test {% + (d) => { + const left = d[0]; + const operator = toAstToken(d[2]); + const right = d[4]; + return new ExprNS.BoolOp(left.startToken, right.endToken, left, operator, right); + } + %} + | not_test {% id %} + +not_test -> + %kw_not _ not_test {% + (d) => { + const operator = toAstToken(d[0]); + const right = d[2]; + return new ExprNS.Unary(operator, right.endToken, operator, right); + } + %} + | comparison {% id %} + +comparison -> + arith_expr _ comp_op _ comparison {% + (d) => { + const left = d[0]; + const operator = d[2]; + const right = d[4]; + return new ExprNS.Compare(left.startToken, right.endToken, left, operator, right); + } + %} + | arith_expr {% id %} + +comp_op -> + %less {% (d) => toAstToken(d[0]) %} + | %greater {% (d) => toAstToken(d[0]) %} + | %doubleequal {% (d) => toAstToken(d[0]) %} + | %greaterequal {% (d) => toAstToken(d[0]) %} + | %lessequal {% (d) => toAstToken(d[0]) %} + | %notequal {% (d) => toAstToken(d[0]) %} + | %kw_in {% (d) => toAstToken(d[0]) %} + | %kw_not _ %kw_in {% + (d) => { + const token = toAstToken(d[0]); + token.type = 'NOTIN'; + return token; + } + %} + | %kw_is {% + (d) => toAstToken(d[0]) + %} + | %kw_is _ %kw_not {% + (d) => { + const token = toAstToken(d[0]); + token.type = 'ISNOT'; + return token; + } + %} + +arith_expr -> + term _ arith_op _ arith_expr {% + (d) => { + const left = d[0]; + const operator = d[2]; + const right = d[4]; + return new ExprNS.Binary(left.startToken, right.endToken, left, operator, right); + } + %} + | term {% id %} + +arith_op -> + %plus {% (d) => toAstToken(d[0]) %} + | %minus {% (d) => toAstToken(d[0]) %} + +term -> + factor _ term_op _ term {% + (d) => { + const left = d[0]; + const operator = d[2]; + const right = d[4]; + return new ExprNS.Binary(left.startToken, right.endToken, left, operator, right); + } + %} + | factor {% id %} + +term_op -> + %star {% (d) => toAstToken(d[0]) %} + | %slash {% (d) => toAstToken(d[0]) %} + | %percent {% (d) => toAstToken(d[0]) %} + | %doubleslash {% (d) => toAstToken(d[0]) %} + +factor -> + unary_op _ factor {% + (d) => { + const operator = d[0]; + const right = d[2]; + return new ExprNS.Unary(operator, right.endToken, operator, right); + } + %} + | power {% id %} + +unary_op -> + %plus {% (d) => toAstToken(d[0]) %} + | %minus {% (d) => toAstToken(d[0]) %} + +power -> + atom_expr _ %doublestar _ factor {% + (d) => { + const left = d[0]; + const operator = toAstToken(d[2]); + const right = d[4]; + return new ExprNS.Binary(left.startToken, right.endToken, left, operator, right); + } + %} + | atom_expr {% id %} + +atom_expr -> + atom "(" _ test_list _ ")" {% + (d) => { + const callee = d[0]; + const args = d[3]; + const endToken = args.length > 0 ? 
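+      /*
+       * A call's span runs from the callee's first token to the last
+       * argument's end token, collapsing onto the callee for `f()`. Since the
+       * callee slot is `atom` rather than `atom_expr`, chained calls need
+       * explicit grouping in this grammar: `f(1)(2)` must be written
+       * `(f(1))(2)`.
+       */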
args[args.length - 1].endToken : callee.endToken; + return new ExprNS.Call(callee.startToken, endToken, callee, args); + } + %} + | atom "(" _ ")" {% + (d) => { + const callee = d[0]; + return new ExprNS.Call(callee.startToken, callee.endToken, callee, []); + } + %} + | atom {% id %} + +test_list -> + test {% (d) => [d[0]] %} + | test_list _ "," _ test {% (d) => d[0].concat([d[4]]) %} + | test _ "," {% (d) => [d[0]] %} + +atom -> + "(" _ test _ ")" {% + (d) => { + const lparen = toAstToken({ type: 'LPAREN', value: '(', line: 0, col: 0, offset: 0 }); + const rparen = toAstToken({ type: 'RPAREN', value: ')', line: 0, col: 0, offset: 0 }); + return new ExprNS.Grouping(lparen, rparen, d[2]); + } + %} + | %identifier {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Variable(token, token, token); + } + %} + | %integer {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, parseInt(token.lexeme)); + } + %} + | %float {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, parseFloat(token.lexeme)); + } + %} + | %bigint {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.BigIntLiteral(token, token, token.lexeme.slice(0, -1)); + } + %} + | %complex {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Complex(token, token, token.lexeme); + } + %} + | string_literal {% + (d) => { + const token = d[0]; + return new ExprNS.Literal(token, token, token.lexeme); + } + %} + | %kw_None {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.None(token, token, "None"); + } + %} + | %kw_True {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, true); + } + %} + | %kw_False {% + (d) => { + const token = toAstToken(d[0]); + return new ExprNS.Literal(token, token, false); + } + %} + +string_literal -> + %stringTripleDouble {% (d) => toAstToken(d[0]) %} + | %stringTripleSingle {% (d) => toAstToken(d[0]) %} + | %stringDouble {% (d) => toAstToken(d[0]) %} + | %stringSingle {% (d) => toAstToken(d[0]) %} + +# Whitespace (optional) +_ -> null | %ws +__ -> %ws + diff --git a/src/runner/pyRunner.ts b/src/runner/pyRunner.ts index 7a81aed..899f029 100644 --- a/src/runner/pyRunner.ts +++ b/src/runner/pyRunner.ts @@ -3,6 +3,7 @@ import { CSEResultPromise, evaluate } from "../cse-machine/interpreter"; import { RecursivePartial, Result } from "../types"; import { Tokenizer } from "../tokenizer"; import { Parser } from "../parser"; +import { parse } from '../nearley/parser-adapter'; import { Resolver } from "../resolver"; import { Program } from "estree"; import { Translator } from "../translator"; @@ -20,10 +21,11 @@ function parsePythonToEstreeAst( doValidate: boolean = false ): Program { const script = code + "\n"; - const tokenizer = new Tokenizer(script); - const tokens = tokenizer.scanEverything(); - const pyParser = new Parser(script, tokens); - const ast = pyParser.parse(); + // const tokenizer = new Tokenizer(script); + // const tokens = tokenizer.scanEverything(); + // const pyParser = new Parser(script, tokens); + // const ast = pyParser.parse(); + const ast = parse(script); if (doValidate) { new Resolver(script, ast).resolve(ast); } From 26474de35416b7d8e18b381d06654a0f7cfe09d1 Mon Sep 17 00:00:00 2001 From: loyaltypollution <65063925+loyaltypollution@users.noreply.github.com> Date: Thu, 30 Oct 2025 02:00:36 +0800 Subject: [PATCH 3/4] Fix bigint generation --- src/nearley/lexer.ts | 3 +-- src/nearley/python-grammar.ts | 11 ++--------- 
src/nearley/python.ne | 11 ++--------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/nearley/lexer.ts b/src/nearley/lexer.ts index 51c0f99..ebba344 100644 --- a/src/nearley/lexer.ts +++ b/src/nearley/lexer.ts @@ -18,9 +18,8 @@ export const lexer = moo.compile({ // Numbers complex: /(?:\d+\.?\d*|\.\d+)[jJ]/, - bigint: /\d+[nN]/, + bigint: /\d+/, float: /(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?/, - integer: /\d+/, // Strings (simplified - doesn't handle all edge cases yet) stringTripleDouble: /"""(?:[^"\\]|\\["\\/bfnrt]|\\u[a-fA-F0-9]{4})*?"""/, diff --git a/src/nearley/python-grammar.ts b/src/nearley/python-grammar.ts index a5ffe7e..c50cd02 100644 --- a/src/nearley/python-grammar.ts +++ b/src/nearley/python-grammar.ts @@ -9,7 +9,6 @@ import { TokenType } from '../tokens'; const tokenTypeMap: { [key: string]: TokenType } = { 'identifier': TokenType.NAME, - 'integer': TokenType.NUMBER, 'float': TokenType.NUMBER, 'bigint': TokenType.BIGINT, 'complex': TokenType.COMPLEX, @@ -86,7 +85,7 @@ function toAstToken(token: any): AstToken { return new AstToken( type, token.value, - token.line || 0, + token.line - 1 || 0, token.col || 0, token.offset || 0 ); @@ -454,12 +453,6 @@ let ParserRules = [ return new ExprNS.Variable(token, token, token); } }, - {"name": "atom", "symbols": [(pythonLexer.has("integer") ? {type: "integer"} : integer)], "postprocess": - (d) => { - const token = toAstToken(d[0]); - return new ExprNS.Literal(token, token, parseInt(token.lexeme)); - } - }, {"name": "atom", "symbols": [(pythonLexer.has("float") ? {type: "float"} : float)], "postprocess": (d) => { const token = toAstToken(d[0]); @@ -469,7 +462,7 @@ let ParserRules = [ {"name": "atom", "symbols": [(pythonLexer.has("bigint") ? {type: "bigint"} : bigint)], "postprocess": (d) => { const token = toAstToken(d[0]); - return new ExprNS.BigIntLiteral(token, token, token.lexeme.slice(0, -1)); + return new ExprNS.BigIntLiteral(token, token, token.lexeme); } }, {"name": "atom", "symbols": [(pythonLexer.has("complex") ? 
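 /*
  * moo reports 1-based line numbers; the `token.line - 1 || 0` change above
  * shifts them to the 0-based convention the AST tokens appear to expect,
  * with `|| 0` doubling as a guard when `line` is absent (NaN collapses to 0).
  */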
{type: "complex"} : complex)], "postprocess": diff --git a/src/nearley/python.ne b/src/nearley/python.ne index 990e608..6f1541c 100644 --- a/src/nearley/python.ne +++ b/src/nearley/python.ne @@ -11,7 +11,6 @@ import { TokenType } from '../tokens'; const tokenTypeMap: { [key: string]: TokenType } = { 'identifier': TokenType.NAME, - 'integer': TokenType.NUMBER, 'float': TokenType.NUMBER, 'bigint': TokenType.BIGINT, 'complex': TokenType.COMPLEX, @@ -88,7 +87,7 @@ function toAstToken(token: any): AstToken { return new AstToken( type, token.value, - token.line || 0, + token.line - 1 || 0, token.col || 0, token.offset || 0 ); @@ -551,12 +550,6 @@ atom -> return new ExprNS.Variable(token, token, token); } %} - | %integer {% - (d) => { - const token = toAstToken(d[0]); - return new ExprNS.Literal(token, token, parseInt(token.lexeme)); - } - %} | %float {% (d) => { const token = toAstToken(d[0]); @@ -566,7 +559,7 @@ atom -> | %bigint {% (d) => { const token = toAstToken(d[0]); - return new ExprNS.BigIntLiteral(token, token, token.lexeme.slice(0, -1)); + return new ExprNS.BigIntLiteral(token, token, token.lexeme); } %} | %complex {% From 67df345d2ddc95e6ae246b96c01009ec79e9bcae Mon Sep 17 00:00:00 2001 From: loyaltypollution <65063925+loyaltypollution@users.noreply.github.com> Date: Thu, 30 Oct 2025 02:13:09 +0800 Subject: [PATCH 4/4] Remove type:module in package.json --- package.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/package.json b/package.json index 0686ada..5e97965 100644 --- a/package.json +++ b/package.json @@ -4,13 +4,10 @@ "description": "", "main": "dist/index.js", "types": "dist/index.d.ts", - "type": "module", "scripts": { "regen": "npm run build && node dist/generate.js", "compile-grammar": "nearleyc src/nearley/python.ne -o src/nearley/python-grammar.ts", - "start:dev": "npx nodemon", "build": "rollup -c --bundleConfigAsCjs", - "start": "npm run build && node dist/index.js", "jsdoc": "./scripts/jsdoc.sh", "test": "jest" },