Optional braces

Answered

I'm trying to create a grammar for a Scala-like language.  I'm struggling with optional braces. Could I get some feedback on the best way to accomplish this?

Here's what I have so far:

BNF

{
parserClass="com.wavesplatform.rideplugin.parser.RideParser"
parserUtilClass="com.wavesplatform.rideplugin.parser.RideParserUtil"

extends="com.intellij.extapi.psi.ASTWrapperPsiElement"

psiClassPrefix="Ride"
psiImplClassSuffix="Impl"
psiPackage="com.wavesplatform.rideplugin.psi"
psiImplPackage="com.wavesplatform.rideplugin.psi.impl"

psiImplUtilClass="com.wavesplatform.rideplugin.psi.impl.RidePsiImplUtil"

implements="com.wavesplatform.rideplugin.psi.RideCompositeElement"
extends="com.wavesplatform.rideplugin.psi.impl.RideCompositeElementImpl"

elementTypeHolderClass="com.wavesplatform.rideplugin.psi.RideTypes"
elementTypeClass="com.wavesplatform.rideplugin.psi.RideElementType"
tokenTypeClass="com.wavesplatform.rideplugin.psi.RideTokenType"

elementTypeFactory("var_definition")="com.wavesplatform.rideplugin.psi.impl.RideElementTypeFactory.factory"


tokens=[
ASSIGN = "="
LDBRACKET = "{-#"
RDBRACKET = "#-}"
//IDENTIFIER = 'regexp:[a-zA-Z_][a-zA-Z0-9_]*'

AT_SYMBOL = "@"
TRUE = "true"
FALSE = "false"
IF = "if"
ELSE = "else"
UNIT = "unit"
//(?:'|").*(?:'|")
//STRING="regexp:([\"'])((?:\\\1|(?:(?!\1)).)*)(\1)"
//BASE='regexp:base16|base58|base64'
PERCENT = '%'
UNDERSCORE = "_"
LBRACKET = '['
RBRACKET = ']'
LBRACE = '{'
RBRACE = '}'
LPAREN = '('
RPAREN = ')'
COLON = ':'
COMMA = ','
EQ = '=='
NOT_EQ = '!='
BANG = '!'
PLUS = '+'
MINUS_MINUS = '--'
MINUS_ASSIGN = '-='
MINUS = '-'
COND_OR = '||'
BIT_OR_ASSIGN = '|='
BIT_CLEAR_ASSIGN = '&^='
BIT_CLEAR = '&^'
COND_AND = '&&'
BIT_AND_ASSIGN = '&='
BIT_AND = '&'
BIT_OR = '|'
MUL = '*'
SLASH = '/'
GREATER_OR_EQUAL = '>='
LESS_OR_EQUAL = '<='
GT = '>'
LESS = '<'
DOT = '.'
CONCAT = '++'
APPEND = ':+'
PREPEND = '::'
FUNCTION = 'func'
MATCH = 'match'
CASE = 'case'
STRICT = 'strict'
LET = 'let'
IF = 'if'
THEN = 'then'
FOLD_KW = "FOLD"
NIL = 'nil'
BOOL="regexp:true|false"
COMMENT = 'regexp:#.*'
INTEGER ="regexp:[0-9]+"
WHITE_SPACE = 'regexp:\s+'
NEW_LINE = "regexp:(\r|\n|\r\n)"
STRING = 'regexp:(\")[^\"]*\"'
SQSTRING = "regexp:(')[^']*'"
IDENT = 'regexp:[a-zA-Z_][a-zA-Z0-9_]*'
//INT = "regexp:[0-9]+"
]
extends(".*expr") = expr
}

program ::= all
all ::= directive* element*
private element ::= !<<eof>> statement {pin=2 recoverWhile=property_recover}
private property_recover ::= !(UNIT | NIL | AT_SYMBOL | FUNCTION | LET | STRICT | TRUE | FALSE |IF |ELSE | LDBRACKET
| RDBRACKET | PERCENT | LESS_OR_EQUAL | GREATER_OR_EQUAL
| INT | STRING | IDENT | PLUS | MINUS| BANG | ASTERISK |SLASH | LESS |GT|EQ | LBRACKET | RBRACKET
| NOT_EQ | COMMA | ASSIGN | COLON | LPAREN |RPAREN |LBRACE |RBRACE | MATCH | CASE | FOLD_KW)

directive ::= LDBRACKET IDENT (IDENT|INTEGER) RDBRACKET

statement ::= var_declaration_statement | expr

var_declaration_statement ::= (LET|STRICT) (tuple_definition | var_definition) ASSIGN (expr | closure) {pin=1}
closure ::= (LBRACE|LPAREN)? block_state (RBRACE|RPAREN)?

tuple_definition ::= LPAREN var_definition (COMMA var_definition)* RPAREN {pin(".*")=1}

var_definition ::= IDENT {
implements="com.wavesplatform.rideplugin.psi.RideNamedElement"
extends="com.wavesplatform.rideplugin.psi.impl.RideVarDefinitionBaseImpl"
stubClass="com.wavesplatform.rideplugin.stubs.RideVarDefinitionStub"
}

expr ::= equal_group
| index_group
| conditional_group
| list_op_group
| calc_prefix_group
| definition_group
| sum_group
| product_group
| call_group
| primary_group
| prefix_group
{
extraRoot=true
//todo methods=[resolveType]
}

private equal_group ::= equal_expr | not_equal_expr
equal_expr ::= expr EQ expr
not_equal_expr ::= expr NOT_EQ expr

private conditional_group ::= less_or_eq_expr | more_or_eq_expr | less_expr | more_expr | and_expr | or_expr
less_expr ::= expr LESS expr
more_expr ::= expr GT expr
less_or_eq_expr ::= expr LESS_OR_EQUAL expr
more_or_eq_expr ::= expr GREATER_OR_EQUAL expr
and_expr ::= expr COND_AND expr
or_expr ::= expr COND_OR expr

private list_op_group ::= concat_expr | append_expr | prepend_expr
concat_expr ::= expr CONCAT expr
append_expr ::= expr APPEND expr
prepend_expr ::= expr PREPEND expr

private sum_group ::= plus_expr | minus_expr
plus_expr ::= expr PLUS expr
minus_expr ::= expr MINUS expr

private product_group ::= mul_expr | div_expr | mod_expr
mul_expr ::= expr MUL expr
div_expr ::= expr SLASH expr
mod_expr ::= expr PERCENT expr

private calc_prefix_group ::= unary_min_expr | unary_not_expr
unary_min_expr ::= MINUS expr {pin = 1}
unary_not_expr ::= BANG expr {pin = 1}

private call_group ::= call_expr
call_expr ::= field_call | function_call
function_call ::= function_name LPAREN arguments? RPAREN (DOT function_name call_arguments)* {pin=2}
field_call ::= (function_call | IDENT) (DOT IDENT)+
call_arguments ::= LPAREN arguments? RPAREN {pin = 1}
arguments ::= argument (COMMA argument)* {pin(".*")=1}
argument ::= expr

private definition_group ::= func_expr | array_expr | pattern_matching_expr | fold_expr | tuple_expr

//todo doesnt work
private index_group ::= index_expr
index_expr ::= expr LBRACKET expr RBRACKET (DOT expr)?

private prefix_group ::= if_expr
if_expr ::= IF LPAREN? if_cond RPAREN? (THEN? closure) else_block? {pin=1}
if_cond ::= expr {pin=1}
else_block ::= ELSE closure {pin = 1}

private primary_group ::= simple_ref_expr | paren_expr | literal_expr
paren_expr ::= LPAREN expr RPAREN {pin=1}

simple_ref_expr ::= IDENT
//todo {methods=[getReference resolve setName] }

func_expr ::= annotation_expr? FUNCTION IDENT LPAREN param_group? RPAREN ASSIGN closure {pin=2}
function_name ::= IDENT
block_state ::= element*
param_group ::= param_definition (COMMA param_definition)* {pin(".*")=1}
param_definition ::= var_definition COLON type {pin(".*")=1}

annotation_expr ::= annotation LPAREN IDENT RPAREN {pin=1}
annotation ::= AT_SYMBOL IDENT {pin=1}

private type ::= tuple_type | array_type | union_type | simple_type
tuple_type ::= LPAREN type (COMMA type)* RPAREN {pin(".*")=1}
array_type ::= IDENT LBRACKET (type | IDENT) RBRACKET {pin=2}
union_type ::= IDENT BIT_OR (IDENT|NIL|UNIT) (BIT_OR (IDENT|NIL|UNIT))* {pin=2}
simple_type ::= IDENT

array_expr ::= LBRACKET arguments? RBRACKET {pin = 1}

pattern_matching_expr ::= MATCH expr LBRACE case_expr* RBRACE {pin=1}
case_expr ::= CASE (UNDERSCORE | IDENT) COLON? type? '=>' closure {pin=1}
//case ::= default_case | certain_type_case_definition
//certain_type_case_definition ::= CASE param_definition '=>' statement* {pin=2}
//default_case ::= CASE UNDERSCORE param_definition? '=>' expr* {pin=2}

fold_expr ::= FOLD_KW LESS INTEGER GT call_arguments {pin=1}

tuple_expr ::= call_arguments

literal_expr ::= boolLiteral
| numericLiteral
| stringLiteral
| byteVectorLiteral
| nilLiteral
| unitLiteral
stringLiteral ::= STRING
numericLiteral ::= integerLiteral
boolLiteral ::= TRUE | FALSE
integerLiteral ::= INTEGER
byteVectorLiteral ::= (base16|base58|base64) SQSTRING
nilLiteral ::= NIL
unitLiteral ::= UNIT

FLEX

package com.wavesplatform.rideplugin.parser;

import com.intellij.lexer.FlexLexer;
import com.intellij.psi.tree.IElementType;
import com.wavesplatform.rideplugin.psi.RideTypes;

import static com.intellij.psi.TokenType.BAD_CHARACTER;
import static com.intellij.psi.TokenType.WHITE_SPACE;
import static com.wavesplatform.rideplugin.psi.RideTypes.*;

%%

%{
public _RideLexer() {
this((java.io.Reader)null);
}
%}

%public
%class _RideLexer
%implements FlexLexer
%function advance
%type IElementType
%unicode

EOL=\R
WHITE_SPACE=\s+

AT_SYMBOL=@

BOOL=true|false
COMMENT=#.*
INTEGER=[0-9]+
WHITE_SPACE=[ \t\n\x0B\f\r]+
STRING=(\")[^\"]*\"
SQSTRING=(')[^']*'
IDENT=[a-zA-Z_][a-zA-Z0-9_]*

//%state ANNOTATION

//annotation = {AT_SYMBOL}{IDENT}

%%
<YYINITIAL> {
{WHITE_SPACE} { return WHITE_SPACE; }

"=" { return ASSIGN; }
"{-#" { return LDBRACKET; }
"#-}" { return RDBRACKET; }
"true" { return TRUE; }
"false" { return FALSE; }
"if" { return IF; }
"else" { return ELSE; }
"unit" { return UNIT; }
"_" { return UNDERSCORE; }
"[" { return LBRACKET; }
"]" { return RBRACKET; }
"{" { return LBRACE; }
"}" { return RBRACE; }
"(" { return LPAREN; }
")" { return RPAREN; }
":" { return COLON; }
"," { return COMMA; }
"==" { return EQ; }
"!=" { return NOT_EQ; }
"!" { return BANG; }
"+" { return PLUS; }
"--" { return MINUS_MINUS; }
"-=" { return MINUS_ASSIGN; }
"-" { return MINUS; }
"||" { return COND_OR; }
"|=" { return BIT_OR_ASSIGN; }
"&^=" { return BIT_CLEAR_ASSIGN; }
"&^" { return BIT_CLEAR; }
"&&" { return COND_AND; }
"&=" { return BIT_AND_ASSIGN; }
"&" { return BIT_AND; }
"|" { return BIT_OR; }
"*" { return MUL; }
"/" { return SLASH; }
"%" { return PERCENT; }
">=" { return GREATER_OR_EQUAL; }
"<=" { return LESS_OR_EQUAL; }
">" { return GT; }
"<" { return LESS; }
"." { return DOT; }
"++" { return CONCAT; }
":+" { return APPEND; }
"::" { return PREPEND; }
"func" { return FUNCTION; }
"match" { return MATCH; }
"case" { return CASE; }
"strict" { return STRICT; }
"let" { return LET; }
"then" { return THEN; }
"FOLD" { return FOLD_KW; }
"nil" { return NIL; }
"INT" { return INT; }
"ASTERISK" { return ASTERISK; }
"base16" { return BASE16; }
"base58" { return BASE58; }
"base64" { return BASE64; }

{BOOL} { return BOOL; }
{COMMENT} { return COMMENT; }
{INTEGER} { return INTEGER; }
{WHITE_SPACE} { return WHITE_SPACE; }
{STRING} { return STRING; }
{SQSTRING} { return SQSTRING; }
{IDENT} { return IDENT; }
{AT_SYMBOL} { return AT_SYMBOL; }

}

[^] { return BAD_CHARACTER; }

The problem is the rule "closure":

closure ::= (LBRACE|LPAREN)? block_state (RBRACE|RPAREN)?

This rule can have optional braces, so for example if statement can be written like this:

if (a % 2 == 0) then accum1 else accum2

Or like this:

if (a % 2 == 0) then 
{ accum1 }
else
{ accum2 }

The problem appears in PSI, when braces are omitted, so parser can’t not identify the end of the closure, like this:

In the current example parser think that close brace belongs to else statement, not the function body. 

Could you help me please, with any advice how can I improve my parser, to correct identification of such cases?

Thanks in advance!

0
2 comments

Hi,

I'm not sure if it is the only issue, but the optional braces/parens for the closure rule are handled strangely.

The current state can match input like LBRACE block_state RPAREN, which I guess is invalid, and other combinations.

Try something like:

closure ::= LBRACE block_state RBRACE 
| LPAREN block_state RPAREN
| block_state

So the possibilities are clearly defined.

1

That helps, thanks a lot!

0

Please sign in to leave a comment.