Optional braces

Answered

Created November 02, 2022 14:50

I'm trying to create a grammar for a Scala-like language. I'm struggling with optional braces. Could I get some feedback on the best way to accomplish this?

Here's what I have so far:

BNF

{
  // Names of the generated parser class and of the parser utility class it builds on.
  parserClass="com.wavesplatform.rideplugin.parser.RideParser"
  parserUtilClass="com.wavesplatform.rideplugin.parser.RideParserUtil"

  // NOTE(review): `extends` is assigned a second time further down (RideCompositeElementImpl);
  // confirm which value Grammar-Kit applies and drop the other one.
  extends="com.intellij.extapi.psi.ASTWrapperPsiElement"

  // Naming and packages of the generated PSI interfaces and implementations.
  psiClassPrefix="Ride"
  psiImplClassSuffix="Impl"
  psiPackage="com.wavesplatform.rideplugin.psi"
  psiImplPackage="com.wavesplatform.rideplugin.psi.impl"

  psiImplUtilClass="com.wavesplatform.rideplugin.psi.impl.RidePsiImplUtil"

  // Common interface / base class for the generated PSI elements.
  implements="com.wavesplatform.rideplugin.psi.RideCompositeElement"
  extends="com.wavesplatform.rideplugin.psi.impl.RideCompositeElementImpl"

  // Holder of the generated element/token type constants and the classes used to create them.
  elementTypeHolderClass="com.wavesplatform.rideplugin.psi.RideTypes"
  elementTypeClass="com.wavesplatform.rideplugin.psi.RideElementType"
  tokenTypeClass="com.wavesplatform.rideplugin.psi.RideTokenType"

  // var_definition gets its element type from a custom factory (the rule also declares a stub class).
  elementTypeFactory("var_definition")="com.wavesplatform.rideplugin.psi.impl.RideElementTypeFactory.factory"


  tokens=[
    ASSIGN = "="
    LDBRACKET = "{-#"
    RDBRACKET = "#-}"
    //IDENTIFIER = 'regexp:[a-zA-Z_][a-zA-Z0-9_]*'

    AT_SYMBOL = "@"
    TRUE = "true"
    FALSE = "false"
    IF = "if"
    ELSE = "else"
    UNIT = "unit"
    //(?:'|").*(?:'|")
    //STRING="regexp:([\"'])((?:\\\1|(?:(?!\1)).)*)(\1)"
    //BASE='regexp:base16|base58|base64'
    PERCENT = '%'
    UNDERSCORE = "_"
    LBRACKET = '['
    RBRACKET = ']'
    LBRACE               =  '{'
    RBRACE               =  '}'
    LPAREN               =  '('
    RPAREN               =  ')'
    COLON                =  ':'
    COMMA                =  ','
    EQ                   =  '=='
    NOT_EQ               =  '!='
    BANG                  =  '!'
    PLUS                 =  '+'
    MINUS_MINUS          =  '--'
    MINUS_ASSIGN         =  '-='
    MINUS                =  '-'
    COND_OR              =  '||'
    BIT_OR_ASSIGN        =  '|='
    BIT_CLEAR_ASSIGN     =  '&^='
    BIT_CLEAR            =  '&^'
    COND_AND             =  '&&'
    BIT_AND_ASSIGN       =  '&='
    BIT_AND              =  '&'
    BIT_OR               =  '|'
    MUL                  =  '*'
    SLASH             =  '/'
    GREATER_OR_EQUAL     =  '>='
    LESS_OR_EQUAL     =  '<='
    GT              =  '>'
    LESS                 =  '<'
    DOT                  =  '.'
    CONCAT = '++'
    APPEND = ':+'
    PREPEND = '::'
    FUNCTION = 'func'
    MATCH = 'match'
    CASE = 'case'
    STRICT = 'strict'
    LET = 'let'
    // IF is declared once, next to ELSE above; the repeated IF = 'if' entry that used to sit here was redundant.
    THEN = 'then'
    FOLD_KW = "FOLD"
    NIL = 'nil'
    // NOTE(review): TRUE and FALSE are declared as separate keyword tokens above and none of the
    // grammar rules shown references BOOL; it is kept because the lexer still names it - confirm it is needed.
    BOOL="regexp:true|false"
    COMMENT = 'regexp:#.*'
    INTEGER ="regexp:[0-9]+"
    WHITE_SPACE = 'regexp:\s+'
    NEW_LINE = "regexp:(\r|\n|\r\n)"
    STRING = 'regexp:(\")[^\"]*\"'
    SQSTRING = "regexp:(')[^']*'"
    IDENT = 'regexp:[a-zA-Z_][a-zA-Z0-9_]*'
    //INT = "regexp:[0-9]+"
  ]
  // Every rule whose name ends in "expr" produces a PSI class that extends the `expr` PSI class.
  extends(".*expr") = expr
}

// Entry rule of the grammar.
program ::= all
// Zero or more directives followed by zero or more top-level elements.
all ::= directive* element*
// One statement, attempted only when not at end of input; property_recover is the recovery predicate.
// NOTE(review): pin=2 points at `statement`, the last item of the sequence - confirm pin=1 was not intended.
private element ::= !<<eof>> statement {pin=2 recoverWhile=property_recover}
// Recovery predicate for `element`: it holds while the next token is NOT one of the tokens listed here,
// so the listed tokens are the points at which recovery stops.
// INTEGER and MUL are the names the token list declares for integer literals and '*'. The list used to
// name only INT and ASTERISK, which are not declared there, so recovery never stopped at an integer
// literal or at '*'. INT and ASTERISK are kept because the lexer still returns them for the literal
// words "INT" and "ASTERISK".
private property_recover ::= !(UNIT | NIL | AT_SYMBOL | FUNCTION | LET | STRICT | TRUE | FALSE |IF |ELSE | LDBRACKET
| RDBRACKET | PERCENT | LESS_OR_EQUAL | GREATER_OR_EQUAL
| INT | INTEGER | STRING | IDENT | PLUS | MINUS| BANG | ASTERISK | MUL |SLASH | LESS |GT|EQ | LBRACKET | RBRACKET
| NOT_EQ | COMMA | ASSIGN |  COLON | LPAREN |RPAREN |LBRACE |RBRACE | MATCH | CASE | FOLD_KW)

// A directive: "{-#", a name, an identifier or integer value, then "#-}".
directive ::= LDBRACKET IDENT (IDENT|INTEGER) RDBRACKET

// A statement is either a let/strict declaration or a bare expression.
statement ::= var_declaration_statement | expr

// `let` / `strict` binding of one name, or of a parenthesised tuple of names, to an expression or a closure.
var_declaration_statement ::= (LET|STRICT) (tuple_definition | var_definition) ASSIGN (expr | closure) {pin=1}
// Body used by declarations, function definitions, if/else branches and case arms.
// NOTE(review): the opening and the closing delimiter are optional independently of each other, so this
// rule also accepts mismatched pairs such as LBRACE block_state RPAREN, or a closing delimiter on its own.
closure ::= (LBRACE|LPAREN)? block_state (RBRACE|RPAREN)?

// Parenthesised, comma-separated list of names on the left-hand side of a declaration.
tuple_definition ::= LPAREN var_definition (COMMA var_definition)* RPAREN {pin(".*")=1}

// A single declared name. Its PSI element implements RideNamedElement, extends
// RideVarDefinitionBaseImpl and declares RideVarDefinitionStub as its stub class.
var_definition ::= IDENT {
    implements="com.wavesplatform.rideplugin.psi.RideNamedElement"
    extends="com.wavesplatform.rideplugin.psi.impl.RideVarDefinitionBaseImpl"
    stubClass="com.wavesplatform.rideplugin.stubs.RideVarDefinitionStub"
}

// Root expression rule: a choice between the operator and operand groups defined below.
// NOTE(review): Grammar-Kit is expected to derive operator priority from the order of these
// alternatives - confirm the order matches the intended precedence (for example, index_group is
// listed second and all relational and logical operators share conditional_group).
expr ::= equal_group
        | index_group
        | conditional_group
        | list_op_group
        | calc_prefix_group
        | definition_group
        | sum_group
        | product_group
        | call_group
        | primary_group
        | prefix_group
        {
        extraRoot=true
        //todo methods=[resolveType]
        }

// Equality operators: == and !=.
private equal_group ::= equal_expr | not_equal_expr
equal_expr ::= expr EQ expr
not_equal_expr ::= expr NOT_EQ expr

// Relational (<, >, <=, >=) and logical (&&, ||) operators, all in one group.
private conditional_group ::= less_or_eq_expr | more_or_eq_expr | less_expr | more_expr | and_expr | or_expr
less_expr ::= expr LESS expr
more_expr ::= expr GT expr
less_or_eq_expr ::= expr LESS_OR_EQUAL expr
more_or_eq_expr ::= expr GREATER_OR_EQUAL expr
and_expr ::= expr COND_AND expr
or_expr ::= expr COND_OR expr

// List operators: ++ (concat), :+ (append) and :: (prepend).
private list_op_group ::= concat_expr | append_expr | prepend_expr
concat_expr ::= expr CONCAT expr
append_expr ::= expr APPEND expr
prepend_expr ::= expr PREPEND expr

// Additive operators: + and -.
private sum_group ::= plus_expr | minus_expr
plus_expr ::= expr PLUS expr
minus_expr ::= expr MINUS expr

// Multiplicative operators: *, / and %.
private product_group ::= mul_expr | div_expr | mod_expr
mul_expr ::= expr MUL expr
div_expr ::= expr SLASH expr
mod_expr ::= expr PERCENT expr

// Unary prefix operators: - and !, each pinned on the operator token.
private calc_prefix_group ::= unary_min_expr | unary_not_expr
unary_min_expr ::= MINUS expr  {pin = 1}
unary_not_expr ::= BANG expr  {pin = 1}

// Calls: either a field access chain or a function call.
private call_group ::= call_expr
call_expr ::= field_call | function_call
// name(args) optionally followed by any number of chained .name(args) calls.
function_call ::= function_name LPAREN arguments? RPAREN (DOT function_name call_arguments)* {pin=2}
// A function call or a plain identifier followed by one or more .field accesses.
field_call ::= (function_call | IDENT) (DOT IDENT)+
// Parenthesised, possibly empty, argument list.
call_arguments ::= LPAREN arguments? RPAREN  {pin = 1}
// One or more comma-separated arguments; each argument is an expression.
arguments ::= argument (COMMA argument)*  {pin(".*")=1}
argument ::= expr

// Constructs that introduce functions, list literals, match expressions, folds and tuples.
private definition_group ::= func_expr | array_expr | pattern_matching_expr | fold_expr | tuple_expr

//todo doesnt work
// Indexing: expr[expr], optionally followed by ".expr".
private index_group ::= index_expr
index_expr ::= expr LBRACKET expr RBRACKET (DOT expr)?

// if-expression. The parentheses around the condition and the `then` keyword are each optional,
// and the two parentheses are optional independently of each other.
private prefix_group ::= if_expr
if_expr ::= IF LPAREN? if_cond RPAREN? (THEN? closure) else_block? {pin=1}
if_cond ::= expr {pin=1}
else_block ::= ELSE closure {pin = 1}

// Operands: a name reference, a parenthesised expression or a literal.
private primary_group ::=  simple_ref_expr | paren_expr | literal_expr
paren_expr ::= LPAREN expr RPAREN {pin=1}

// Reference to a name by identifier.
simple_ref_expr ::= IDENT
//todo {methods=[getReference resolve setName] }

// Function definition: optional annotation, `func`, name, parenthesised parameter list, `=`, then the body.
func_expr ::= annotation_expr? FUNCTION IDENT LPAREN param_group? RPAREN ASSIGN closure  {pin=2}
function_name ::= IDENT
// Contents of a closure: zero or more elements (this rule can match empty input).
block_state ::= element*
// One or more comma-separated parameters, each written as name : type.
param_group ::= param_definition (COMMA param_definition)*  {pin(".*")=1}
param_definition ::= var_definition COLON type {pin(".*")=1}

// Annotation in front of a function: @Name(identifier).
annotation_expr ::= annotation LPAREN IDENT RPAREN {pin=1}
annotation ::= AT_SYMBOL IDENT {pin=1}

// Type syntax: tuple (A, B), parameterised Name[T], union A | B | ..., or a plain name.
private type ::=  tuple_type | array_type | union_type | simple_type
tuple_type ::= LPAREN type (COMMA type)* RPAREN {pin(".*")=1}
array_type ::= IDENT LBRACKET (type | IDENT) RBRACKET {pin=2}
union_type ::= IDENT BIT_OR (IDENT|NIL|UNIT) (BIT_OR (IDENT|NIL|UNIT))* {pin=2}
simple_type ::= IDENT

// List literal: [ args ], where the argument list may be empty.
array_expr ::= LBRACKET arguments? RBRACKET {pin = 1}

// match <expr> { case ... } with zero or more case arms.
pattern_matching_expr ::= MATCH expr LBRACE case_expr*  RBRACE {pin=1}
// One arm: `case`, a name or `_`, optional `:` and optional type, `=>`, then the arm body.
// NOTE(review): '=>' is not declared in the token list; it is written here as a quoted literal.
case_expr ::= CASE (UNDERSCORE | IDENT) COLON? type? '=>' closure {pin=1}
//case ::= default_case | certain_type_case_definition
//certain_type_case_definition ::= CASE param_definition '=>' statement* {pin=2}
//default_case ::= CASE UNDERSCORE param_definition? '=>' expr* {pin=2}

// FOLD<N>(args): the FOLD keyword, an integer between < and >, then a parenthesised argument list.
fold_expr ::= FOLD_KW LESS INTEGER GT call_arguments {pin=1}

// Tuple literal, written with the same syntax as a parenthesised argument list.
tuple_expr ::= call_arguments

// Literals: booleans, integers, double-quoted strings, byte vectors, nil and unit.
literal_expr ::= boolLiteral
          | numericLiteral
          | stringLiteral
          | byteVectorLiteral
          | nilLiteral
          | unitLiteral
stringLiteral ::= STRING
numericLiteral ::= integerLiteral
boolLiteral ::= TRUE | FALSE
integerLiteral ::= INTEGER
// base16 / base58 / base64 prefix followed by a single-quoted string.
byteVectorLiteral ::= (base16|base58|base64) SQSTRING
nilLiteral ::= NIL
unitLiteral ::= UNIT

FLEX

package com.wavesplatform.rideplugin.parser;

import com.intellij.lexer.FlexLexer;
import com.intellij.psi.tree.IElementType;
import com.wavesplatform.rideplugin.psi.RideTypes;

import static com.intellij.psi.TokenType.BAD_CHARACTER;
import static com.intellij.psi.TokenType.WHITE_SPACE;
import static com.wavesplatform.rideplugin.psi.RideTypes.*;

%%

%{
  // No-argument constructor: delegates to the Reader-based constructor with a null reader.
  public _RideLexer() {
    this((java.io.Reader)null);
  }
%}

// Generated class: public _RideLexer implementing FlexLexer, whose scanning method is
// advance() and returns IElementType.
%public
%class _RideLexer
%implements FlexLexer
%function advance
%type IElementType
%unicode

// NOTE(review): EOL is not referenced by any rule shown below - confirm it can be dropped.
EOL=\R
// NOTE(review): WHITE_SPACE is defined a second time further down with a different pattern;
// confirm which definition JFlex applies and remove the other.
WHITE_SPACE=\s+

AT_SYMBOL=@

// NOTE(review): the rules section lists the literals "true" and "false" before {BOOL} - confirm BOOL is still needed.
BOOL=true|false
// A comment starts with '#' and runs to the end of the line.
COMMENT=#.*
INTEGER=[0-9]+
WHITE_SPACE=[ \t\n\x0B\f\r]+
// Double-quoted and single-quoted strings; neither pattern supports escaping the closing quote.
STRING=(\")[^\"]*\"
SQSTRING=(')[^']*'
IDENT=[a-zA-Z_][a-zA-Z0-9_]*

//%state ANNOTATION

//annotation = {AT_SYMBOL}{IDENT}

%%
<YYINITIAL> {
  // Whitespace is reported as the platform WHITE_SPACE token type.
  {WHITE_SPACE}      { return WHITE_SPACE; }

  // Fixed-text tokens. When several rules match the same longest text, the rule listed first is
  // used, so keywords must stay above {IDENT}.
  "="                { return ASSIGN; }
  "{-#"              { return LDBRACKET; }
  "#-}"              { return RDBRACKET; }
  "true"             { return TRUE; }
  "false"            { return FALSE; }
  "if"               { return IF; }
  "else"             { return ELSE; }
  "unit"             { return UNIT; }
  "_"                { return UNDERSCORE; }
  "["                { return LBRACKET; }
  "]"                { return RBRACKET; }
  "{"                { return LBRACE; }
  "}"                { return RBRACE; }
  "("                { return LPAREN; }
  ")"                { return RPAREN; }
  ":"                { return COLON; }
  ","                { return COMMA; }
  "=="               { return EQ; }
  "!="               { return NOT_EQ; }
  "!"                { return BANG; }
  "+"                { return PLUS; }
  "--"               { return MINUS_MINUS; }
  "-="               { return MINUS_ASSIGN; }
  "-"                { return MINUS; }
  "||"               { return COND_OR; }
  "|="               { return BIT_OR_ASSIGN; }
  "&^="              { return BIT_CLEAR_ASSIGN; }
  "&^"               { return BIT_CLEAR; }
  "&&"               { return COND_AND; }
  "&="               { return BIT_AND_ASSIGN; }
  "&"                { return BIT_AND; }
  "|"                { return BIT_OR; }
  "*"                { return MUL; }
  "/"                { return SLASH; }
  "%"                { return PERCENT; }
  ">="               { return GREATER_OR_EQUAL; }
  "<="               { return LESS_OR_EQUAL; }
  ">"                { return GT; }
  "<"                { return LESS; }
  "."                { return DOT; }
  "++"               { return CONCAT; }
  ":+"               { return APPEND; }
  "::"               { return PREPEND; }
  "func"             { return FUNCTION; }
  "match"            { return MATCH; }
  "case"             { return CASE; }
  "strict"           { return STRICT; }
  "let"              { return LET; }
  "then"             { return THEN; }
  "FOLD"             { return FOLD_KW; }
  "nil"              { return NIL; }
  // NOTE(review): these two rules turn the literal words INT and ASTERISK into their own token
  // types, so identifiers spelled that way are not IDENT - confirm this is intended.
  "INT"              { return INT; }
  "ASTERISK"         { return ASTERISK; }
  "base16"           { return BASE16; }
  "base58"           { return BASE58; }
  "base64"           { return BASE64; }

  // Pattern tokens. The former {BOOL} rule and the second {WHITE_SPACE} rule were removed: the
  // "true"/"false" literals and the first {WHITE_SPACE} rule above match the same text and are
  // listed earlier, so those two rules could never be selected.
  {COMMENT}          { return COMMENT; }
  {INTEGER}          { return INTEGER; }
  {STRING}           { return STRING; }
  {SQSTRING}         { return SQSTRING; }
  {IDENT}            { return IDENT; }
  {AT_SYMBOL}        { return AT_SYMBOL; }

}

[^] { return BAD_CHARACTER; }

The problem is the rule "closure":

closure ::= (LBRACE|LPAREN)? block_state (RBRACE|RPAREN)?

This rule can have optional braces, so, for example, an if statement can be written like this:

if (a % 2 == 0) then accum1 else accum2

Or like this:

if (a % 2 == 0) then 
    { accum1 } 
    else 
    { accum2 }

The problem appears in the PSI when the braces are omitted: the parser cannot identify the end of the closure, like this:

In the current example, the parser thinks that the closing brace belongs to the else branch, not to the function body.

Could you please give me some advice on how I can improve my parser so that it identifies such cases correctly?

Thanks in advance!

2 comments

Karol Lewandowski

Created November 07, 2022 12:50

Hi,

I'm not sure if it is the only issue, but the optional braces/parens for the closure rule are handled strangely.

The current state can match input like LBRACE block_state RPAREN, which I guess is invalid, and other combinations.

Try something like:

closure ::= LBRACE block_state RBRACE 
          | LPAREN block_state RPAREN
          | block_state

So the possibilities are clearly defined.

adevyatkin

Created November 15, 2022 13:23

That helps, thanks a lot!

Please sign in to leave a comment.