From e7ba93870ed270d55172333f20616cd9a0f8a3a3 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Wed, 1 Jun 2022 19:33:42 +0200 Subject: [PATCH] Add simple atoms --- grammar.js | 35 +++++- src/grammar.json | 139 ++++++++++++++++++++++- src/node-types.json | 45 +++++++- src/parser.c | 238 ++++++++++++++++++++++++++++++++++----- test/corpus/literals.txt | 110 ++++++++++++++++++ 5 files changed, 530 insertions(+), 37 deletions(-) create mode 100644 test/corpus/literals.txt diff --git a/grammar.js b/grammar.js index f768f8c..93096c4 100644 --- a/grammar.js +++ b/grammar.js @@ -2,8 +2,39 @@ module.exports = grammar({ name: "tiger", rules: { - // TODO: add the actual grammar rules - source_file: $ => 'hello' + source_file: ($) => choice( + $._expr, + ), + + _expr: ($) => choice( + "nil", + $.integer_literal, + $.string_literal, + ), + + integer_literal: (_) => /[0-9]+/, + + string_literal: ($) => seq( + '"', + repeat(choice($.escape_sequence, /[^"\\]+/)), + '"', + ), + + escape_sequence: (_) => token.immediate( + seq( + "\\", + choice( + // Special escapes + choice("a", "b", "f", "n", "r", "t", "v"), + // Octal + /[0-3][0-7]{2}/, + // Hexadecimal + seq("x", /[0-9a-fA-F]{2}/), + // Escaped characters + choice("\\", '"'), + ) + ) + ), } }); diff --git a/src/grammar.json b/src/grammar.json index d3fe927..49406f9 100644 --- a/src/grammar.json +++ b/src/grammar.json @@ -2,8 +2,143 @@ "name": "tiger", "rules": { "source_file": { - "type": "STRING", - "value": "hello" + "type": "CHOICE", + "members": [ + { + "type": "SYMBOL", + "name": "_expr" + } + ] + }, + "_expr": { + "type": "CHOICE", + "members": [ + { + "type": "STRING", + "value": "nil" + }, + { + "type": "SYMBOL", + "name": "integer_literal" + }, + { + "type": "SYMBOL", + "name": "string_literal" + } + ] + }, + "integer_literal": { + "type": "PATTERN", + "value": "[0-9]+" + }, + "string_literal": { + "type": "SEQ", + "members": [ + { + "type": "STRING", + "value": "\"" + }, + { + "type": "REPEAT", + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SYMBOL", + "name": "escape_sequence" + }, + { + "type": "PATTERN", + "value": "[^\"\\\\]+" + } + ] + } + }, + { + "type": "STRING", + "value": "\"" + } + ] + }, + "escape_sequence": { + "type": "IMMEDIATE_TOKEN", + "content": { + "type": "SEQ", + "members": [ + { + "type": "STRING", + "value": "\\" + }, + { + "type": "CHOICE", + "members": [ + { + "type": "CHOICE", + "members": [ + { + "type": "STRING", + "value": "a" + }, + { + "type": "STRING", + "value": "b" + }, + { + "type": "STRING", + "value": "f" + }, + { + "type": "STRING", + "value": "n" + }, + { + "type": "STRING", + "value": "r" + }, + { + "type": "STRING", + "value": "t" + }, + { + "type": "STRING", + "value": "v" + } + ] + }, + { + "type": "PATTERN", + "value": "[0-3][0-7]{2}" + }, + { + "type": "SEQ", + "members": [ + { + "type": "STRING", + "value": "x" + }, + { + "type": "PATTERN", + "value": "[0-9a-fA-F]{2}" + } + ] + }, + { + "type": "CHOICE", + "members": [ + { + "type": "STRING", + "value": "\\" + }, + { + "type": "STRING", + "value": "\"" + } + ] + } + ] + } + ] + } } }, "extras": [ diff --git a/src/node-types.json b/src/node-types.json index 43a6442..8cb6f0e 100644 --- a/src/node-types.json +++ b/src/node-types.json @@ -2,10 +2,51 @@ { "type": "source_file", "named": true, - "fields": {} + "fields": {}, + "children": { + "multiple": false, + "required": false, + "types": [ + { + "type": "integer_literal", + "named": true + }, + { + "type": "string_literal", + "named": true + } + ] + } }, { - "type": "hello", + "type": "string_literal", + "named": true, + "fields": {}, + "children": { + "multiple": true, + "required": false, + "types": [ + { + "type": "escape_sequence", + "named": true + } + ] + } + }, + { + "type": "\"", + "named": false + }, + { + "type": "escape_sequence", + "named": true + }, + { + "type": "integer_literal", + "named": true + }, + { + "type": "nil", "named": false } ] \ No newline at end of file diff --git a/src/parser.c b/src/parser.c index 1c8116e..8fadf25 100644 --- a/src/parser.c +++ b/src/parser.c @@ -6,31 +6,52 @@ #endif #define LANGUAGE_VERSION 13 -#define STATE_COUNT 4 +#define STATE_COUNT 9 #define LARGE_STATE_COUNT 2 -#define SYMBOL_COUNT 3 +#define SYMBOL_COUNT 10 #define ALIAS_COUNT 0 -#define TOKEN_COUNT 2 +#define TOKEN_COUNT 6 #define EXTERNAL_TOKEN_COUNT 0 #define FIELD_COUNT 0 -#define MAX_ALIAS_SEQUENCE_LENGTH 1 +#define MAX_ALIAS_SEQUENCE_LENGTH 3 #define PRODUCTION_ID_COUNT 1 enum { - anon_sym_hello = 1, - sym_source_file = 2, + anon_sym_nil = 1, + sym_integer_literal = 2, + anon_sym_DQUOTE = 3, + aux_sym_string_literal_token1 = 4, + sym_escape_sequence = 5, + sym_source_file = 6, + sym__expr = 7, + sym_string_literal = 8, + aux_sym_string_literal_repeat1 = 9, }; static const char * const ts_symbol_names[] = { [ts_builtin_sym_end] = "end", - [anon_sym_hello] = "hello", + [anon_sym_nil] = "nil", + [sym_integer_literal] = "integer_literal", + [anon_sym_DQUOTE] = "\"", + [aux_sym_string_literal_token1] = "string_literal_token1", + [sym_escape_sequence] = "escape_sequence", [sym_source_file] = "source_file", + [sym__expr] = "_expr", + [sym_string_literal] = "string_literal", + [aux_sym_string_literal_repeat1] = "string_literal_repeat1", }; static const TSSymbol ts_symbol_map[] = { [ts_builtin_sym_end] = ts_builtin_sym_end, - [anon_sym_hello] = anon_sym_hello, + [anon_sym_nil] = anon_sym_nil, + [sym_integer_literal] = sym_integer_literal, + [anon_sym_DQUOTE] = anon_sym_DQUOTE, + [aux_sym_string_literal_token1] = aux_sym_string_literal_token1, + [sym_escape_sequence] = sym_escape_sequence, [sym_source_file] = sym_source_file, + [sym__expr] = sym__expr, + [sym_string_literal] = sym_string_literal, + [aux_sym_string_literal_repeat1] = aux_sym_string_literal_repeat1, }; static const TSSymbolMetadata ts_symbol_metadata[] = { @@ -38,14 +59,42 @@ static const TSSymbolMetadata ts_symbol_metadata[] = { .visible = false, .named = true, }, - [anon_sym_hello] = { + [anon_sym_nil] = { .visible = true, .named = false, }, + [sym_integer_literal] = { + .visible = true, + .named = true, + }, + [anon_sym_DQUOTE] = { + .visible = true, + .named = false, + }, + [aux_sym_string_literal_token1] = { + .visible = false, + .named = false, + }, + [sym_escape_sequence] = { + .visible = true, + .named = true, + }, [sym_source_file] = { .visible = true, .named = true, }, + [sym__expr] = { + .visible = false, + .named = true, + }, + [sym_string_literal] = { + .visible = true, + .named = true, + }, + [aux_sym_string_literal_repeat1] = { + .visible = false, + .named = false, + }, }; static const TSSymbol ts_alias_sequences[PRODUCTION_ID_COUNT][MAX_ALIAS_SEQUENCE_LENGTH] = { @@ -61,30 +110,101 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { eof = lexer->eof(lexer); switch (state) { case 0: - if (eof) ADVANCE(5); - if (lookahead == 'h') ADVANCE(1); + if (eof) ADVANCE(10); + if (lookahead == '"') ADVANCE(13); + if (lookahead == '\\') ADVANCE(6); + if (lookahead == 'n') ADVANCE(2); if (lookahead == '\t' || lookahead == '\n' || lookahead == '\r' || - lookahead == ' ') SKIP(0) + lookahead == ' ') SKIP(9) + if (('0' <= lookahead && lookahead <= '9')) ADVANCE(12); END_STATE(); case 1: - if (lookahead == 'e') ADVANCE(3); + if (lookahead == '"') ADVANCE(13); + if (lookahead == '\\') ADVANCE(6); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') ADVANCE(14); + if (lookahead != 0) ADVANCE(15); END_STATE(); case 2: - if (lookahead == 'l') ADVANCE(4); + if (lookahead == 'i') ADVANCE(3); END_STATE(); case 3: - if (lookahead == 'l') ADVANCE(2); + if (lookahead == 'l') ADVANCE(11); END_STATE(); case 4: - if (lookahead == 'o') ADVANCE(6); + if (('0' <= lookahead && lookahead <= '7')) ADVANCE(16); END_STATE(); case 5: - ACCEPT_TOKEN(ts_builtin_sym_end); + if (('0' <= lookahead && lookahead <= '7')) ADVANCE(4); END_STATE(); case 6: - ACCEPT_TOKEN(anon_sym_hello); + if (lookahead == '"' || + lookahead == '\\' || + lookahead == 'a' || + lookahead == 'b' || + lookahead == 'f' || + lookahead == 'n' || + lookahead == 'r' || + lookahead == 't' || + lookahead == 'v') ADVANCE(16); + if (lookahead == 'x') ADVANCE(8); + if (('0' <= lookahead && lookahead <= '3')) ADVANCE(5); + END_STATE(); + case 7: + if (('0' <= lookahead && lookahead <= '9') || + ('A' <= lookahead && lookahead <= 'F') || + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(16); + END_STATE(); + case 8: + if (('0' <= lookahead && lookahead <= '9') || + ('A' <= lookahead && lookahead <= 'F') || + ('a' <= lookahead && lookahead <= 'f')) ADVANCE(7); + END_STATE(); + case 9: + if (eof) ADVANCE(10); + if (lookahead == '"') ADVANCE(13); + if (lookahead == 'n') ADVANCE(2); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') SKIP(9) + if (('0' <= lookahead && lookahead <= '9')) ADVANCE(12); + END_STATE(); + case 10: + ACCEPT_TOKEN(ts_builtin_sym_end); + END_STATE(); + case 11: + ACCEPT_TOKEN(anon_sym_nil); + END_STATE(); + case 12: + ACCEPT_TOKEN(sym_integer_literal); + if (('0' <= lookahead && lookahead <= '9')) ADVANCE(12); + END_STATE(); + case 13: + ACCEPT_TOKEN(anon_sym_DQUOTE); + END_STATE(); + case 14: + ACCEPT_TOKEN(aux_sym_string_literal_token1); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') ADVANCE(14); + if (lookahead != 0 && + lookahead != '"' && + lookahead != '\\') ADVANCE(15); + END_STATE(); + case 15: + ACCEPT_TOKEN(aux_sym_string_literal_token1); + if (lookahead != 0 && + lookahead != '"' && + lookahead != '\\') ADVANCE(15); + END_STATE(); + case 16: + ACCEPT_TOKEN(sym_escape_sequence); END_STATE(); default: return false; @@ -94,41 +214,97 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { static const TSLexMode ts_lex_modes[STATE_COUNT] = { [0] = {.lex_state = 0}, [1] = {.lex_state = 0}, - [2] = {.lex_state = 0}, - [3] = {.lex_state = 0}, + [2] = {.lex_state = 1}, + [3] = {.lex_state = 1}, + [4] = {.lex_state = 1}, + [5] = {.lex_state = 0}, + [6] = {.lex_state = 0}, + [7] = {.lex_state = 0}, + [8] = {.lex_state = 0}, }; static const uint16_t ts_parse_table[LARGE_STATE_COUNT][SYMBOL_COUNT] = { [0] = { [ts_builtin_sym_end] = ACTIONS(1), - [anon_sym_hello] = ACTIONS(1), + [anon_sym_nil] = ACTIONS(1), + [sym_integer_literal] = ACTIONS(1), + [anon_sym_DQUOTE] = ACTIONS(1), + [sym_escape_sequence] = ACTIONS(1), }, [1] = { - [sym_source_file] = STATE(3), - [anon_sym_hello] = ACTIONS(3), + [sym_source_file] = STATE(5), + [sym__expr] = STATE(6), + [sym_string_literal] = STATE(6), + [anon_sym_nil] = ACTIONS(3), + [sym_integer_literal] = ACTIONS(3), + [anon_sym_DQUOTE] = ACTIONS(5), }, }; static const uint16_t ts_small_parse_table[] = { - [0] = 1, - ACTIONS(5), 1, - ts_builtin_sym_end, - [4] = 1, + [0] = 3, ACTIONS(7), 1, + anon_sym_DQUOTE, + STATE(3), 1, + aux_sym_string_literal_repeat1, + ACTIONS(9), 2, + aux_sym_string_literal_token1, + sym_escape_sequence, + [11] = 3, + ACTIONS(11), 1, + anon_sym_DQUOTE, + STATE(4), 1, + aux_sym_string_literal_repeat1, + ACTIONS(13), 2, + aux_sym_string_literal_token1, + sym_escape_sequence, + [22] = 3, + ACTIONS(15), 1, + anon_sym_DQUOTE, + STATE(4), 1, + aux_sym_string_literal_repeat1, + ACTIONS(17), 2, + aux_sym_string_literal_token1, + sym_escape_sequence, + [33] = 1, + ACTIONS(20), 1, + ts_builtin_sym_end, + [37] = 1, + ACTIONS(22), 1, + ts_builtin_sym_end, + [41] = 1, + ACTIONS(24), 1, + ts_builtin_sym_end, + [45] = 1, + ACTIONS(26), 1, ts_builtin_sym_end, }; static const uint32_t ts_small_parse_table_map[] = { [SMALL_STATE(2)] = 0, - [SMALL_STATE(3)] = 4, + [SMALL_STATE(3)] = 11, + [SMALL_STATE(4)] = 22, + [SMALL_STATE(5)] = 33, + [SMALL_STATE(6)] = 37, + [SMALL_STATE(7)] = 41, + [SMALL_STATE(8)] = 45, }; static const TSParseActionEntry ts_parse_actions[] = { [0] = {.entry = {.count = 0, .reusable = false}}, [1] = {.entry = {.count = 1, .reusable = false}}, RECOVER(), - [3] = {.entry = {.count = 1, .reusable = true}}, SHIFT(2), - [5] = {.entry = {.count = 1, .reusable = true}}, REDUCE(sym_source_file, 1), - [7] = {.entry = {.count = 1, .reusable = true}}, ACCEPT_INPUT(), + [3] = {.entry = {.count = 1, .reusable = true}}, SHIFT(6), + [5] = {.entry = {.count = 1, .reusable = true}}, SHIFT(2), + [7] = {.entry = {.count = 1, .reusable = false}}, SHIFT(7), + [9] = {.entry = {.count = 1, .reusable = true}}, SHIFT(3), + [11] = {.entry = {.count = 1, .reusable = false}}, SHIFT(8), + [13] = {.entry = {.count = 1, .reusable = true}}, SHIFT(4), + [15] = {.entry = {.count = 1, .reusable = false}}, REDUCE(aux_sym_string_literal_repeat1, 2), + [17] = {.entry = {.count = 2, .reusable = true}}, REDUCE(aux_sym_string_literal_repeat1, 2), SHIFT_REPEAT(4), + [20] = {.entry = {.count = 1, .reusable = true}}, ACCEPT_INPUT(), + [22] = {.entry = {.count = 1, .reusable = true}}, REDUCE(sym_source_file, 1), + [24] = {.entry = {.count = 1, .reusable = true}}, REDUCE(sym_string_literal, 2), + [26] = {.entry = {.count = 1, .reusable = true}}, REDUCE(sym_string_literal, 3), }; #ifdef __cplusplus diff --git a/test/corpus/literals.txt b/test/corpus/literals.txt new file mode 100644 index 0000000..c4492fb --- /dev/null +++ b/test/corpus/literals.txt @@ -0,0 +1,110 @@ +================================================================================ +Integer literal +================================================================================ + +42 + +-------------------------------------------------------------------------------- + +(source_file + (integer_literal)) + +================================================================================ +String literal +================================================================================ + +"Hello World!" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal)) + +================================================================================ +String literal special character escapes +================================================================================ + +"Hello\nWorld!" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal + (escape_sequence))) + +================================================================================ +String literal octal +================================================================================ + +"Hello World\041" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal + (escape_sequence))) + +================================================================================ +String literal hex +================================================================================ + +"Hello World\x21" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal + (escape_sequence))) + +================================================================================ +String literal character escapes +================================================================================ + +"Hello\\\"World\"" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal + (escape_sequence) + (escape_sequence) + (escape_sequence))) + +================================================================================ +Unterminated string literal +================================================================================ + +" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal + (MISSING """))) + +================================================================================ +String literal unterminated escape +================================================================================ + +"\" + +-------------------------------------------------------------------------------- + +(source_file + (string_literal + (escape_sequence) + (MISSING """))) + +================================================================================ +String literal invalid octal +================================================================================ + +"\399" + +-------------------------------------------------------------------------------- + +(source_file + (ERROR + (UNEXPECTED '9')) + (integer_literal) + (ERROR))