【背景】
需要用antlr实现C语言的预处理:
include,define等等内容。
参考了:
[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)
已经实现了部分的事情。
代码如下:
grammar preprocess;
//lexer grammar preprocess;
options{
language=Java;
output = AST;
}
@lexer::header {
//package com.mm.antlrv3demo;
import java.io.*;
import java.util.*;
}
@parser::header {
//package com.mm.antlrv3demo;
}
@lexer::members {
//public static TokenStreamSelector selector; // must be assigned externally
// Preprocessor state held by the generated lexer.
// ifState, ifStates and defines are static; defineArgs is an instance field.
// NOTE(review): ifState and ifStates are not referenced by any rule or action visible in this
// grammar - presumably reserved for conditional directives that are not ported yet; confirm.
protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true
protected static List ifStates = new ArrayList(); // holds nested if conditions
// Written by the DIRECTIVE rule: macro name -> List describing the macro.
protected static Map defines = new Hashtable(); // holds the defines
// Read by the IDENTIFIER rule: macro parameter name -> List holding the call argument text.
protected Map defineArgs = new Hashtable(); // holds the args for a macro call
/*
public void uponEOF() throws TokenStreamException, CharStreamException {
try {
selector.pop(); // return to old lexer/stream
selector.retry();
} catch (NoSuchElementException e) {
// return a real EOF if nothing in stack
}
}
*/
/**
 * Snapshot of a suspended character stream: the stream itself together with the
 * marker returned by mark() at the moment the snapshot was taken.
 */
class SaveStruct {
    /** Stream that was current when the snapshot was created. */
    public CharStream input;
    /** Value returned by input.mark() during construction. */
    public int marker;

    SaveStruct(CharStream input){
        marker = input.mark();
        this.input = input;
    }
}
Stack<SaveStruct> includes = new Stack<SaveStruct>();
// We should override this method for handling EOF of included file
// Returns the next token. When the current stream reports EOF while a suspended stream is
// waiting on the includes stack, lexing resumes from that stream instead of returning EOF.
public Token nextToken(){
Token token = super.nextToken();
if(token.getType() == Token.EOF && !includes.empty()){
// We've got EOF and have non empty stack.
// Restore the most recently suspended stream and go back to the position marked when it was saved.
SaveStruct ss = includes.pop();
setCharStream(ss.input);
input.rewind(ss.marker);
//this should be used instead of super [like below] to handle exits from nested includes
//it matters, when the 'include' token is the last in previous stream (using super, lexer 'crashes' returning EOF token)
token = this.nextToken();
}
// Skip first token after switching on another input.
// You need to use this rather than super as there may be nested include files
// NOTE(review): a negative start index presumably identifies the token produced by the rule
// whose action called reset() - confirm against the ANTLR 3 runtime.
if(((CommonToken)token).getStartIndex() < 0)
token = this.nextToken();
return token;
}
}
// Comments: a line comment (from // up to and including the line break) or a
// block comment (/* ... */, matched non-greedily). Both alternatives call skip().
// NOTE(review): the line-comment alternative requires a terminating '\n', so a comment on the
// last line of a file without a trailing newline would not match - confirm if that matters.
COMMENT
: ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
| ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
;
// Lexer rule for  #include "file" : suspends the current character stream on the
// includes stack and continues lexing from the named file. nextToken() resumes the
// suspended stream once the included file reaches EOF.
// Throws Error (carrying the underlying exception as its cause) when the file cannot be opened.
INCLUDE : '#include' (WS)? f=STRING
    {
        String name = f.getText();
        // drop the first and last character, i.e. the quotes matched by STRING
        name = name.substring(1,name.length()-1);
        try {
            // open the file first so that a failure leaves the includes stack untouched
            CharStream included = new ANTLRFileStream(name);
            // save current lexer's state
            includes.push(new SaveStruct(input));
            // switch on new input stream
            setCharStream(included);
            reset();
        } catch(Exception fnf) {
            // keep the original exception as the cause instead of discarding it
            throw new Error("Cannot open file " + name, fnf);
        }
    };
/*
fragment
NON_CR_LF : ~('\r'|'\n');
fragment
TAB_SPACE
: (' ' | '\t');
*/
//DIRECTIVE : ('#define' WS* defineMacro=ID WS* defineText=STRING)
//DIRECTIVE : ('#define' WS* defineMacro=ID WS* defineText=( NON_CR_LF+ | (NON_CR_LF* (TAB_SPACE+ '\\' '\r'? '\n' NON_CR_LF+)*) ) )
// Replacement text of a #define: any run of characters other than CR/LF, where a backslash
// directly followed by an optional CR and an LF is also accepted, so the text can span lines.
// The commented-out variants below are earlier attempts to drop the continuation backslash
// (via skip(), the hidden channel, or setText("")).
fragment
//MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;
// '#define' directive. Stores the macro in the shared defines table under its name and
// calls skip() so that no token is produced for the directive itself.
// The stored List has the layout [ replacementText, param1, param2, ... ]:
// slot 0 always holds the (possibly empty) macro text, the remaining slots hold the
// parameter names of a function-like macro, in declaration order.
DIRECTIVE @init{
    List args = new ArrayList();
    // Reserve slot 0 for the macro text before any parameter name is appended.
    // Without this, args.set(0, ...) below throws IndexOutOfBoundsException for a macro
    // without parameters and overwrites the first parameter name otherwise.
    args.add("");
} : ('#define' WS* defineMacro=RAW_IDENTIFIER
        (
            ( '(' // get arguments if you find them (no spaces before left paren)
              (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
              ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText=MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\n'
        {
            defines.put( defineMacro.getText(), args );
            skip();
        }
    );
// Identifier, with macro-argument substitution (first attempt, still using the ANTLR v2 selector API).
// The identifier text is looked up in defineArgs; when an entry is found, a sub-lexer is created
// over the stored text and pushed onto the selector.
// NOTE(review): only defineArgs is consulted here; the static defines table is not looked up.
// NOTE(review): selector is not declared in the visible lexer members (its declaration is
// commented out), so the final action cannot compile as written.
IDENTIFIER @init{
List define = new ArrayList();
List args = new ArrayList();
} :
identifier=RAW_IDENTIFIER
{
// see if this is a macro argument
define = (List)defineArgs.get(identifier.getText());
}
// First alternative: the entry has more than one element, so a parenthesised argument list is required.
( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
// take in arguments if macro call requires them
'('
callArg0=EXPR {args.add(callArg0.getText());}
( COMMA callArg1=EXPR {args.add(callArg1.getText());} )*
{ args.size()==define.size()-1 }? // better have right amount
')'
// Second alternative: no entry, or an entry with a single element - nothing more is consumed.
| { !((define!=null) && (define.size()>1)) }?
)
{
if (define!=null) {
// element 0 of the entry is the text to substitute
String defineText = (String)define.get(0);
// create a new lexer to handle the macro text
// NOTE(review): lexers generated by ANTLR v3 are normally constructed from a CharStream;
// confirm that a constructor taking a DataInputStream exists.
preprocessLexer sublexer = new preprocessLexer(new DataInputStream(new StringBufferInputStream(defineText)));
for (int i=0;i<args.size();++i) {
// treat macro arguments similar to local defines
// entry element 1+i is used as the key for call argument i
List arg = new ArrayList();
arg.add((String)args.get(i));
sublexer.defineArgs.put( (String)define.get(1+i), arg );
}
selector.push(sublexer);
// retry in new lexer
selector.retry();
}
};
// Plain identifier shape: a letter or underscore followed by any number of letters, digits or underscores.
// Fragment rule; referenced by DIRECTIVE and IDENTIFIER.
fragment RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;
// A digit followed by any number of digits, letters or underscores.
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ; // allow alpha suffixes on numbers (e.g. L for long)
// group symbols into categories to parse EXPR
// LEFT and RIGHT are the opening and closing bracket characters, COMMA is the separator,
// and OPERATOR lists the other single punctuation characters accepted inside EXPR.
LEFT : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;
// Loose expression used for macro call arguments in IDENTIFIER.
// Shape: optional whitespace, an optional NUMBER or IDENTIFIER, then optionally one of
// (a bracketed, comma-separated group of EXPR | a STRING | an OPERATOR) followed by another EXPR.
// A top-level COMMA is not part of EXPR, so it separates arguments in the calling rule.
fragment EXPR // allow just about anything without being ambiguous
: (WS)? (NUMBER|IDENTIFIER)?
(
( LEFT EXPR ( COMMA EXPR )* RIGHT
| STRING
| OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
)
EXPR
)?
;
//INT : '0'..'9'+ ;
// Floating-point literal in three forms: digits '.' optional digits with optional exponent;
// '.' digits with optional exponent; digits with a mandatory exponent.
// NOTE(review): the third form (for example 1e5) is also matched by NUMBER - confirm which
// rule ANTLR selects for such input.
FLOAT
: ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
| '.' ('0'..'9')+ EXPONENT?
| ('0'..'9')+ EXPONENT
;
// A single whitespace character (space, tab, CR or LF); the token is placed on the HIDDEN channel.
// Also referenced from INCLUDE, DIRECTIVE, IDENTIFIER and EXPR.
WS : ( ' '
| '\t'
| '\r'
| '\n'
) {$channel=HIDDEN;}
;
//RestSymbo : '{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;
// Double-quoted string literal: any mix of escape sequences and characters other than
// backslash and double quote. The token text includes the surrounding quotes.
STRING
: '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
;
// Single-quoted character literal: exactly one escape sequence or one character other than
// single quote and backslash.
CHAR: '\'' ( ESC_SEQ | ~('\''|'\\') ) '\''
;
// Exponent part of a FLOAT: e or E, an optional sign, then one or more digits.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
// One hexadecimal digit, upper or lower case; used by UNICODE_ESC.
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
// Escape sequence inside STRING and CHAR: a backslash followed by one of b t n f r " ' \,
// or a unicode escape, or an octal escape.
fragment
ESC_SEQ
: '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
| UNICODE_ESC
| OCTAL_ESC
;
// Octal escape: a backslash followed by one to three octal digits; in the three-digit form
// the first digit is limited to 0..3.
fragment
OCTAL_ESC
: '\\' ('0'..'3') ('0'..'7') ('0'..'7')
| '\\' ('0'..'7') ('0'..'7')
| '\\' ('0'..'7')
;
// Unicode escape: backslash, the letter u, then exactly four hexadecimal digits.
fragment
UNICODE_ESC
: '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
;
// Parser rules. header accepts zero or more include elements; include is a single INCLUDE token.
// The commented-out text after include is an earlier, literal-based form for angle-bracket includes.
header
: include*;
include : INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>';
但是还是遇到很多问题,其中最主要的是:旧的antlr v2中的TokenStreamSelector,在antlr v3中该如何替换,即应该用哪些函数和类来替代。
【折腾过程】
1.关于预处理的问题,这人:
[antlr-interest] C PreProcessor Errors
也遇到类似的事情,但是对此处没啥帮助。
2.这里:
[antlr-interest] ANTLR 3 migration: TokenStreamSelector
和:
[antlr-interest] TokenStreamSelector + ANTLRv3
也提到了,v2转v3时,如何处理TokenStreamSelector,但是没人回答。
3.这里:
Tips on designing a preprocessor for C++ using Antlr
关于预处理,已经解释的很全了,但是还是antlr v2的版本,还是不能完全透彻的理解,还是无法找到TokenStreamSelector的替代品。
4.google搜:
antlr TokenStream Selector deprecated
看到了“Token Stream Multiplexing”,所以,去找找antlr作者写的书
The Definitive ANTLR Reference.pdf
看看其中关于此部分的解释,或许可以找到有价值的参考资料。
5.另外,顺便提示一句,上述代码中的那个:
testLiterals
实际上是antlr v2的语法
根据:
Migrating from ANTLR 2 to ANTLR 3
的某人评论,得知此testLiterals,antlr v3中也没了。
6.参考:
[antlr-interest] v3 – How to deal with include Files?
也讨论了类似问题,但是还是无解。
7.自己看代码,有一点点眉目了:
(1)antlr v2中的处理新的lexer(和tokenStream)的逻辑
public static TokenStreamSelector selector; // must be assigned externally
protected static Map defines = new Hashtable(); // holds the defines
public void uponEOF() throws TokenStreamException, CharStreamException {
try {
selector.pop(); // return to old lexer/stream
selector.retry();
}
......
}
: '#'
( "include" (WS)? includeFile:STRING {
......
try {
cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
sublexer.defines = defines; // want defines to be persistent
sublexer.setFilename(name);
selector.push(sublexer);
selector.retry();
}
......
}
}
......
} else {
// create a new lexer to handle the macro text
cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText)));
......
selector.push(sublexer);
// retry in new lexer
selector.retry();
}
}};
即,主要是:
用new cppLexer新建一个sublexer,
然后初始化一堆东西,比如:
给对应的给全局变量defines去赋值等等
然后就转到新的sublexer去处理了,调用方法是:
先push
再retry
而后,对于新的lexer,都有对应的uponEOF,
其中目的是遇到了EOF,要返回之前的(父级的)lexer,所以
先去pop(返回到上一级,父级的lexer)
再去retry(相当于刷新,去使用当前的,父级的lexer)
(2)而与此相对应的,目前已经实现了,antlr v3的,处理新的lexer(和tokenStream)的代码是:
Stack<SaveStruct> includes = new Stack<SaveStruct>();
// We should override this method for handling EOF of included file
public Token nextToken(){
......
if(token.getType() == Token.EOF && !includes.empty()){
// We've got EOF and have non empty stack.
SaveStruct ss = includes.pop();
setCharStream(ss.input);
input.rewind(ss.marker);
......
}
......
}
// and lexer rule
INCLUDE : '#include' (WS)? f=STRING
{
......
try {
// save current lexer's state
SaveStruct ss = new SaveStruct(input);
includes.push(ss);
// switch on new input stream
setCharStream(new ANTLRFileStream(name));
reset();
}
......
};
逻辑是:
也是,对于遇到了要include的文件,
类似于新的lexer
然后先去新建一个,全局的那个SaveStruct
将其保存起来,即push,即压栈
然后使用当前新的CharStream
然后用reset,使得回到文件最开始处,再重新处理
这样,就是:
先保存了旧的,父级的lexer(tokenStream)
然后用当前child级别的lexer去处理新的内容
处理完成后,即遇到了EOF
然后会在上面的nextToken中遇到
会去对于全局的变量includes,去pop,拿出来,之前保存的父级的lexer
然后通过setCharStream把后续要处理的内容拿出来
再通过input.rewind,定位到之前记录的位置,
就可以继续去处理了。
以此实现了递归的调用。
而基本明白了递归调用,递归处理父级和子级的lexer或tokenStream,CharStream的逻辑后,
接下来,就可以,参考两者的不同之处,找到antlr v3中,如何去模拟此套逻辑了。
8.关于cppLexer.g中的多参数的#define实现宏替换的逻辑过程,参见:
【整理】分析cppLexer.g中的多参数的#define实现宏替换的逻辑过程
搞懂逻辑后,接下来,才是,如何将其转化为antlr v3版本的代码,实现同样的逻辑。
9.暂时写了如下代码:
// Replacement text of a #define (second attempt): any run of characters other than LF, where a
// backslash directly followed by LF is also accepted so the text can span lines.
// Unlike the earlier commented-out variants, CR is no longer excluded, so it is matched as
// ordinary macro text.
fragment
//MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
//MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT : (('\\' '\r'? '\n') | (~('\n')))*;
MACRO_TEXT : (('\\' '\n') | (~('\n')))*;
//MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;
// '#define' directive. Records the macro under its name in the shared defines table and
// calls skip(), so the directive itself yields no token.
// Layout of the stored List: element 0 is the macro text (empty when the directive has no
// body), elements 1.. are the parameter names in declaration order.
DIRECTIVE @init{
    List macroEntry = new ArrayList();
} : ('#define' WS* defineMacro=RAW_IDENTIFIER
        {
            // placeholder for the macro text, replaced once the body has been matched
            macroEntry.add("");
        }
        (
            ( '(' // parameter list; the paren must directly follow the macro name
              (WS)? defineArg0=RAW_IDENTIFIER (WS)?
              {
                  macroEntry.add(defineArg0.getText());
              }
              ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)?
                {
                    macroEntry.add(defineArg1.getText());
                }
              )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // keep the body as raw text; it is tokenized when the macro is used
            defineText=MACRO_TEXT
            {
                macroEntry.set(0, defineText.getText());
            }
        )? '\n'
        {
            defines.put(defineMacro.getText(), macroEntry);
            skip();
        }
    );
// Identifier with macro substitution (second attempt, using the stream stack instead of the v2 selector).
// The identifier text is looked up first in defineArgs, then in defines. An entry with one element
// replaces the token text; an entry with more elements requires a parenthesised argument list and
// switches the lexer to a string stream over the stored macro text.
IDENTIFIER @init{
List define = new ArrayList();
List foundArgs = new ArrayList();
String callArg0Text = "";
String callArg1Text = "";
} :
identifier=RAW_IDENTIFIER
{
// see if this is a macro argument
define = (List)defineArgs.get(identifier.getText());
if (define==null) {
// see if this is a macro call
define = (List)defines.get(identifier.getText());
}
}
// First alternative: no entry, or an entry with a single element - nothing more is consumed.
( { !((define!=null) && (define.size()>1)) }?
|
// Second alternative: the entry has parameters, so an argument list must follow.
{ (define!=null) && (define.size()>1) }? (WS|COMMENT)?
// take in arguments if macro call requires them
'('
callArg0=EXPR
{
callArg0Text = callArg0.getText();
foundArgs.add(callArg0Text);
}
( COMMA callArg1=EXPR
{
callArg1Text = callArg1.getText();
foundArgs.add(callArg1Text);
}
)*
{ foundArgs.size()==define.size()-1 }? // better have right amount
')'
)
{
if (define!=null) {
// element 0 of the entry is the text to substitute
String defineText = (String)define.get(0);
if (define.size()==1) {
// the entry holds only the replacement text, so the token text is replaced directly
// NOTE(review): setText presumably changes only the text of this token; the replacement
// does not appear to be tokenized again - confirm.
setText(defineText);
} else {
// add one map entry per parameter: parameter name -> call argument text
for (int i=0;i<foundArgs.size();++i) {
// treat macro arguments similar to local defines
List arg = new ArrayList();
arg.add((String)foundArgs.get(i));
defineArgs.put( (String)define.get(1+i), arg );
}
// NOTE(review): entries put into defineArgs are not removed anywhere in the visible code;
// confirm whether they should be cleared once the macro text has been consumed.
// save current lexer's state
SaveStruct ss = new SaveStruct(input);
includes.push(ss);
// switch on new input stream
setCharStream(new ANTLRStringStream(defineText));
reset();
}
}
};但是还没成功,且遇到一个问题:
10.
转载请注明:在路上 » 【记录】将antlr v2的C/C++的preprocess,即cpp.g,转换为antlr v3