Add support for nested comments

Unfortunately, the 'extras' array only accepts single tokens, so a
recursive parsing rule cannot be used to express nested comments; they are
tokenized by an external scanner instead.
This commit is contained in:
Bruno BELANYI 2022-06-02 16:13:55 +02:00
parent 75bb2c7009
commit 50a0eaa071
5 changed files with 1672 additions and 1661 deletions

View file

@ -25,6 +25,11 @@ module.exports = grammar({
[$._lvalue, $.array_expression],
],
externals: ($) => [
// Nested comments need to be tokenized externally
$.comment,
],
extras: ($) => [
/( |\n|\r|\t)+/,
$.comment,
@ -36,21 +41,6 @@ module.exports = grammar({
optional($._declaration_chunks),
),
comment: ($) => token(
seq(
"/*",
repeat(
choice(
// Match anything but the end-delimiter
/(\*[^/]|[^*])+/,
// Comments can be nested
// $.comment,
),
),
"*/",
),
),
// Expressions {{{
_expr: ($) => choice(

View file

@ -23,34 +23,6 @@
}
]
},
"comment": {
"type": "TOKEN",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "/*"
},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "(\\*[^/]|[^*])+"
}
]
}
},
{
"type": "STRING",
"value": "*/"
}
]
}
},
"_expr": {
"type": "CHOICE",
"members": [
@ -1468,7 +1440,12 @@
]
],
"precedences": [],
"externals": [],
"externals": [
{
"type": "SYMBOL",
"name": "comment"
}
],
"inline": [],
"supertypes": []
}

File diff suppressed because it is too large Load diff

98
src/scanner.c Normal file
View file

@ -0,0 +1,98 @@
#include <stdint.h>
#include <string.h>
#include <tree_sitter/parser.h>
// Symbols produced by this external scanner. The values are used as indices
// into the `valid_symbols` array and as `result_symbol`, so the order must
// follow the grammar's `externals` list (which only contains `comment`).
enum TokenType {
    COMMENT = 0,
};
// Consume the current lookahead character (without marking it as skipped)
// and hand back the character that was consumed.
static int32_t advance(TSLexer *lexer) {
    int32_t const consumed = lexer->lookahead;

    lexer->advance(lexer, false);

    return consumed;
}
// Consume the lookahead character only if it is `expected`.
// Returns true when the character matched and was consumed; returns false,
// leaving the lexer untouched, at end of input or on any other character.
static bool expect(TSLexer *lexer, int32_t expected) {
    bool const matches = !lexer->eof(lexer) && lexer->lookahead == expected;

    if (matches) {
        lexer->advance(lexer, false);
    }

    return matches;
}
// Skip a whole run of blanks (space, tab, newline, carriage return) in front
// of the next token. Characters are advanced over with `skip` set to true.
// Stops at the first non-whitespace character or at end of input.
static void skip_whitespace(TSLexer *lexer) {
    while (!lexer->eof(lexer)) {
        switch (lexer->lookahead) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            lexer->advance(lexer, true);
            // Leave the switch only: without this the case falls through to
            // `default` and the function returns after a single character.
            break;
        default:
            return;
        }
    }
}
// Scan the remainder of a block comment whose leading '/' has already been
// consumed by the caller. Comments are delimited by "/*" and "*/" and may be
// nested (like OCaml comments): every inner "/*" must be closed by its own
// "*/". Returns true once the outermost comment is closed, false if the '/'
// was not followed by '*' or if input ends while a comment is still open.
static bool scan_comment(TSLexer *lexer) {
    if (!expect(lexer, '*')) {
        // A lone '/' is not a comment opener (e.g. a division operator)
        return false;
    }

    unsigned long depth = 1;

    while (depth != 0) {
        if (lexer->eof(lexer)) {
            break; // Unterminated comment
        }

        int32_t const current = advance(lexer);

        if (current == '/' && expect(lexer, '*')) {
            ++depth; // Entering a nested comment
        } else if (current == '*' && expect(lexer, '/')) {
            --depth; // Leaving the innermost open comment
        }
    }

    return depth == 0;
}
// Create the scanner's payload. This scanner keeps no state between calls,
// so there is nothing to allocate and NULL is returned.
// Declared with `(void)` so the definition is a real prototype (an empty
// parameter list is not one before C23).
void *tree_sitter_tiger_external_scanner_create(void) {
    return NULL;
}
// Release the scanner's payload. Intentionally a no-op: the payload is never
// read or freed here.
void tree_sitter_tiger_external_scanner_destroy(void *payload) {
    (void)payload; // Unused
}
// Serialize the scanner's state into `buffer`. Nothing is written to the
// buffer, so the reported length is always 0.
unsigned tree_sitter_tiger_external_scanner_serialize(void *payload,
                                                      char *buffer) {
    (void)payload; // Unused
    (void)buffer;  // Unused

    unsigned const written = 0;
    return written;
}
// Restore the scanner's state from `buffer`. Intentionally a no-op: all
// arguments are ignored.
void tree_sitter_tiger_external_scanner_deserialize(void *payload,
                                                    char const *buffer,
                                                    unsigned length) {
    (void)payload; // Unused
    (void)buffer;  // Unused
    (void)length;  // Unused
}
// Entry point of the external scanner: try to recognize a (possibly nested)
// comment at the current position.
// - `payload` is unused.
// - `valid_symbols` is indexed by `enum TokenType`; scanning is only
//   attempted when a comment is acceptable here.
// Returns true, with `result_symbol` set to COMMENT, when a complete comment
// was scanned; false otherwise.
bool tree_sitter_tiger_external_scanner_scan(void *payload,
                                             TSLexer *lexer,
                                             bool const *valid_symbols) {
    if (valid_symbols[COMMENT]) {
        // The scanner has to get past leading whitespace by itself
        skip_whitespace(lexer);

        // A comment opens with "/*"; scan_comment takes over once the '/'
        // has been consumed
        if (expect(lexer, '/')) {
            lexer->result_symbol = COMMENT;
            return scan_comment(lexer);
        }
    }

    return false;
}

View file

@ -51,13 +51,7 @@ Nested comment
--------------------------------------------------------------------------------
(source_file
(comment)
(identifier)
(ERROR
(operator)
(operator)
(operator)
(operator)))
(comment))
================================================================================
Unterminated comment