【记录】尝试用antlr处理C代码中的#include

1.真正去实现的时候，可以参考：

Tips on designing a preprocessor for C++ using Antlr

中的例子，去添加对应的action code。

2.参考：

How to do preprocessing in antlr v4?

A list of all available downloads at Soft Gems

Windows Resource File Parser + Converter

下载到 312KB的rc-converter.zip。

3.又从：

[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)

拷贝了一份代码，供后续参考：

//copy from
//http://www.antlr3.org/pipermail/antlr-interest/2004-July/008778.html
//name to:
//cppLexer.g

// Author: Eric Mahurin
// License: just give me credit

options {
    language="Java";
}

{

import java.io.*;
import java.util.*;
import antlr.*;

/**
 * Command-line driver for the preprocessor lexer.
 *
 * Reads source text from standard input, runs it through cppLexer via a
 * shared TokenStreamSelector, and echoes the text of every token to
 * standard output until the end-of-file token is seen.
 */
class cpp implements cppLexerTokenTypes {
    /** Selector shared with cppLexer; sub-lexers are pushed onto and popped from it. */
    public static TokenStreamSelector selector = new TokenStreamSelector();

    public static void main(String[] args) {
        try {
            // The lexer for stdin sits at the bottom of the selector; further
            // lexers for #include and macro calls are stacked on top of it.
            cppLexer rootLexer = new cppLexer(new DataInputStream(System.in));
            cppLexer.selector = selector;
            selector.select(rootLexer);

            Token current = selector.nextToken();
            while (current.getType() != Token.EOF_TYPE) {
                System.out.print(current.getText());
                current = selector.nextToken();
            }
        } catch (Exception e) {
            System.err.println("exception: "+e);
        }
    }
}

}

// Lexer grammar for the preprocessor; the rules that follow belong to cppLexer.
class cppLexer extends Lexer;

options {
    testLiterals = false; // off by default; the IDENTIFIER rule switches it on for itself
    k = 4; // lookahead depth
}

tokens {
    ENDIF ; // token type with no rule of its own; DIRECTIVE assigns it through $setType
}

{
    // Selector holding the stack of active lexers: the main input plus any
    // sub-lexers pushed by #include or by a macro expansion.
    public static TokenStreamSelector selector; // must be assigned externally
    // State of the conditional block currently being lexed.
    protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true
    protected static List ifStates = new ArrayList(); // holds nested if conditions
    // Macro table: name -> List whose element 0 is the macro text and whose
    // remaining elements are the parameter names (filled in by DIRECTIVE).
    protected static Map defines = new Hashtable(); // holds the defines
    // Per-lexer table: parameter name -> single-element List with the argument text.
    protected Map defineArgs = new Hashtable(); // holds the args for a macro call
    // End-of-input hook: drop this lexer from the selector and ask it to retry
    // with the lexer underneath. If the stack is already empty, pop() throws
    // and the method returns normally, so the real EOF is reported.
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        } catch (NoSuchElementException e) {
            // return a real EOF if nothing in stack
        }
    }
}

// DIRECTIVE: matches '#' followed by one preprocessor keyword and carries the
// directive out in the attached action code.
//   include        - push a sub-lexer for the named file onto the selector
//   define / undef - add / remove an entry in the shared `defines` table
//   ifdef / ifndef - save the current ifState, then evaluate the condition
//   else / elsif   - take this branch only if no earlier branch was taken
//   endif          - restore the ifState saved by the matching ifdef/ifndef
// ifState values: 1 = the current branch is live; 0 = the current branch is
// false but a later else/elsif may still be taken; -1 = no branch of this
// conditional can be taken any more.
// While a branch is dead the action loops on selector.nextToken(), throwing
// tokens away until a nested DIRECTIVE comes back with the ENDIF token type.
DIRECTIVE {
    List args = new ArrayList();   // define: [macro text, param1, param2, ...]
    boolean condition = true;      // ifdef/else default; cleared for ifndef, computed for elsif
} : '#'
    ( "include" (WS)? includeFile:STRING {
        if (ifState==1) {
            // found this in examples/java/includeFile
            String name = includeFile.getText();
            // strip the surrounding quote characters
            name = name.substring(1,name.length()-1);
            try {
                cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
                sublexer.defines = defines; // want defines to be persistent
                sublexer.setFilename(name);
                selector.push(sublexer);
                selector.retry();
            } catch (FileNotFoundException fnf) {
                System.err.println("cannot find file "+name);
            }
        }
    }
    | "define" WS defineMacro:RAW_IDENTIFIER
    {
        args.add(""); // first element will hold the macro text
    }
        (
            ( '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0:RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( COMMA (WS)? defineArg1:RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText:MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\n' {newline();}
    { if (ifState==1) {
        // only record the macro when the enclosing branch is live
        defines.put( defineMacro.getText(), args );
        $setType(Token.SKIP);
    }}
    | "undef" WS undefMacro:RAW_IDENTIFIER { if (ifState==1) {
        defines.remove(undefMacro.getText());
        $setType(Token.SKIP);
    }}
    | ("ifdef"|"ifndef"{condition=false;})
        WS ifMacro:RAW_IDENTIFIER
    {
        // remember the enclosing state so "endif" can restore it
        ifStates.add(ifState);
        if (ifState==1) {
            // ifdef: true when defined; ifndef: true when not defined
            condition = (defines.containsKey(ifMacro.getText())==condition);
            ifState = condition?1:0;
        } else {
            // enclosing branch is dead, so nothing in here can be taken
            ifState = -1;
        }
        if (ifState==1) {
            $setType(Token.SKIP);
        } else {
            // gobble up tokens until ENDIF (could be caused by else)
            for (;;) {
                try {
                    if (selector.nextToken().getType()==ENDIF) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        }
    }
    |
        ( "else" // treat like elsif (true)
        | "elsif" WS elsifMacro:RAW_IDENTIFIER {
            condition=defines.containsKey(elsifMacro.getText());
        }
        )
    {
        if (ifState==1) {
            // previous if/elsif was taken - discard rest
            ifState = -1;
            for (;;) {
                try {
                    if (selector.nextToken().getType()==ENDIF) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        } else if (ifState==0 && condition) {
            // "elsif" (true) or "else"
            // ENDIF ends the discard loop that the earlier false branch is running
            $setType(ENDIF);
            ifState = 1;
        }
    }
    | "endif" {
        // was the branch that is closing here a live one?
        condition = (ifState==1);
        try {
            // return to previous if state
            ifState = (Integer)ifStates.remove(ifStates.size()-1);
            if (condition) {
                $setType(Token.SKIP);
            } else {
                // tell if/else/elsif to stop discarding tokens
                $setType(ENDIF);
            }
        } catch (IndexOutOfBoundsException e) {
            // endif with no if
            // Catch the general IndexOutOfBoundsException: remove(-1) on an
            // empty ArrayList throws it on current JDKs, while older JDKs
            // threw its subclass ArrayIndexOutOfBoundsException.
        }
    }
    );

// IDENTIFIER: matches a raw identifier and expands it when the name is bound.
// Lookup order: this lexer's defineArgs (macro arguments) first, then the
// shared defines table. A table entry is a List whose element 0 is the
// replacement text; any further elements are the macro's parameter names.
IDENTIFIER options {testLiterals=true;} {
    List define = new ArrayList();
    List args = new ArrayList();   // actual argument texts collected from the call
} :
    identifier:RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (_createToken && define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    // First alternative: the macro declares parameters, so a parenthesized
    // argument list must follow. Second alternative: anything else, match nothing more.
    ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0:EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1:EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
    | { !((define!=null) && (define.size()>1)) }?
    )
{ if (define!=null) {
    String defineText = (String)define.get(0);
    if (!_createToken) {
        // just substitute text if called from EXPR - no token created
        $setText(defineText);
    } else {
        // create a new lexer to handle the macro text
        cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText)));
        for (int i=0;i<args.size();++i) {
            // treat macro arguments similar to local defines
            // each binding is a one-element list so it has the same shape as a defines entry
            List arg = new ArrayList();
            arg.add((String)args.get(i));
            sublexer.defineArgs.put( (String)define.get(1+i), arg );
        }
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};

// STRING: a double- or single-quoted literal. Inside the quotes a backslash
// consumes whichever character follows it, so an escaped quote does not end the literal.
STRING
    : '"' ( '\\' . | ~('\\'|'"') )* '"' // double quoted string
    | '\'' ( '\\' . | ~('\\'|'\'') )* '\'' // single quoted string
    ;

// MACRO_TEXT: the body of a #define, running up to (not including) the first
// unescaped newline. A backslash directly before a newline continues the body
// onto the next line; the '!' suffix leaves that backslash out of the token text.
protected MACRO_TEXT :
    ( '\\'! '\n' {newline();} // escaped newline
    | ~'\n'
    )*;


// WS: exactly one whitespace character per token. The SKIP action is
// commented out, so whitespace tokens are handed on to the caller.
WS :
    ( ' '
    | '\t'
    | '\f'
    | '\n' {newline();}
    ) { /*$setType(Token.SKIP);*/ };

// COMMENT: a "//" comment including its terminating newline, or a "/* ... */"
// comment. newline() is called for every line break consumed. The SKIP action
// is commented out, so comment tokens are handed on to the caller.
COMMENT :
    ( "//" (~'\n')* '\n' {newline();} // single line comment
    | "/*" ( options{greedy=false;} : '\n' {newline();} | ~('\n') )* "*/" // multi-line comment
    ) { /*$setType(Token.SKIP);*/ };

// RAW_IDENTIFIER: a letter or underscore followed by letters, digits or
// underscores. Protected helper used by DIRECTIVE and IDENTIFIER; it does no
// macro lookup itself.
protected RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_')
('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;

// NUMBER: a digit followed by any mix of digits, letters and underscores.
// Letters are accepted so that alpha suffixes stay attached to the number
// (e.g. L for long).
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ;

// group symbols into categories to parse EXPR
// LEFT / RIGHT: opening and closing brackets of all three kinds.
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
// COMMA: separates macro parameters in DIRECTIVE and call arguments in IDENTIFIER.
COMMA : ',' ;
// OPERATOR: one punctuation character that is not a quote, comma or bracket.
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;

// EXPR: the text of one macro-call argument (used by IDENTIFIER).
// Shape: optional whitespace, an optional NUMBER or IDENTIFIER, then
// optionally one bracketed group / STRING / OPERATOR followed by another EXPR.
// A COMMA is only accepted inside a LEFT ... RIGHT group, so a top-level
// comma ends the argument.
protected EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
        (
            ( LEFT EXPR ( COMMA EXPR )* RIGHT
            | STRING
            | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR
        )?
    ;

4.参考：

How to do preprocessing in antlr v4?

看提到了：

TokenStreamRewriter

不过后来没继续弄。

5.后来是别的同事搞定了对include的处理（包括递归include的功能），完整代码如下：

（同时此处后又加了对于define的简单替换）

grammar preprocess;
//lexer grammar preprocess;

options{
	language=Java;
}

@lexer::header {
//package com.mm.antlrv3demo;

import java.io.*;
import java.util.*;
}

@parser::header {
//package com.mm.antlrv3demo;
}

@lexer::members {
    /**
     * Saved position in a suspended input: the stream that was being read
     * when an #include was hit, plus the mark taken on it at that moment.
     * Static because it never touches the enclosing lexer instance.
     */
    static class SaveStruct {
        SaveStruct(CharStream input){
            this.input = input;
            this.marker = input.mark();
        }
        public final CharStream input;
        public final int marker;
    }

    /** Macro table filled by the DIRECTIVE rule and read by the ID rule: name -> replacement text. */
    static Map<String, String> defines = new Hashtable<String, String>();

    /** Inputs suspended by INCLUDE, innermost include on top. */
    Stack<SaveStruct> includes = new Stack<SaveStruct>();

    /**
     * Overridden so that EOF of an included file resumes the including file
     * instead of ending the token stream.
     */
    @Override
    public Token nextToken(){
        Token token = super.nextToken();

        if(token.getType() == Token.EOF && !includes.empty()){
            // EOF of an included file: go back to the stream that included it.
            SaveStruct ss = includes.pop();
            setCharStream(ss.input);
            input.rewind(ss.marker);
            // Call this.nextToken(), not super.nextToken(), so that leaving
            // several nested includes in a row is handled; with super the
            // lexer returns EOF when the include was the last token of the
            // outer stream.
            token = this.nextToken();
        }

        // Skip first token after switching on another input.
        // this.nextToken() again rather than super, as there may be nested include files.
        if(((CommonToken)token).getStartIndex() < 0)
            token = this.nextToken();

        return token;
    }
}

// COMMENT: a '//' comment up to and including its line ending (LF or CRLF),
// or a non-greedy '/* ... */' comment. Both forms are dropped via skip().
COMMENT
    :   ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
    |   ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
    ;

// and lexer rule
// INCLUDE: '#include' followed by a quoted file name. The action suspends the
// current input on the `includes` stack and points the lexer at the named
// file; the nextToken() override in @lexer::members pops that stack when the
// included file runs out.
INCLUDE    :    '#include' (WS)? f=STRING
{
    String name = f.getText();
    // strip the surrounding double quotes
    name = name.substring(1,name.length()-1);
    try {
        // Open the file first: if that fails, nothing has been pushed and
        // the includes stack still matches the input actually being read.
        CharStream included = new ANTLRFileStream(name);

        // save current lexer's state
        includes.push(new SaveStruct(input));

        // switch on new input stream
        setCharStream(included);
        reset();

    } catch(Exception fnf) {
        // keep the original exception as the cause so the real reason is not lost
        throw new Error("Cannot open file " + name, fnf);
    }
};

// DIRECTIVE: '#define NAME "text"'. Stores the STRING's text, exactly as
// matched, in the `defines` table under NAME, logs the definition to stdout,
// and drops the directive from the token stream.
DIRECTIVE 	:	('#define' WS* defineMacro=ID WS* defineText=STRING)
	{
		final String name = defineMacro.getText();
		final String replacement = defineText.getText();
		System.out.println("Found macro: " + name + "=" + replacement);
		defines.put(name, replacement);
		skip();
	};

// ID: a C-style identifier. When the matched name is a key in `defines`, the
// token's text is replaced by the stored macro value and the substitution is
// logged to stdout.
ID  :	('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
	{
		final String name = getText();
		if (defines.containsKey(name)) {
			final String replacement = (String) defines.get(name);
			System.out.println("Found macro reference, so replce " + name + " to " + replacement);

			setText(replacement);
		}
	};

// INT: one or more decimal digits.
INT :	'0'..'9'+
    ;

// FLOAT: three accepted shapes -
//   digits '.' optional-digits optional-exponent
//   '.' digits optional-exponent
//   digits exponent
FLOAT
    :   ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    |   '.' ('0'..'9')+ EXPONENT?
    |   ('0'..'9')+ EXPONENT
    ;

// WS: a single space, tab, CR or LF, routed to the HIDDEN channel.
WS  :   ( ' '
        | '\t'
        | '\r'
        | '\n'
        ) {$channel=HIDDEN;}
    ;

// RestSymbo: one of the listed punctuation characters that no other rule
// starts with a dedicated token for.
RestSymbo
	:	'{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;

// STRING: a double-quoted literal made of escape sequences (ESC_SEQ) and any
// characters other than backslash and double quote.
STRING
    :  '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
    ;

// CHAR: a single-quoted literal holding exactly one escape sequence or one
// character other than single quote and backslash.
CHAR:  '\'' ( ESC_SEQ | ~('\''|'\\') ) '\''
    ;

// EXPONENT (fragment): 'e' or 'E', an optional sign, then one or more digits.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;

// HEX_DIGIT (fragment): one hexadecimal digit in either letter case.
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;

// ESC_SEQ (fragment): a backslash escape - one of the single-character
// escapes listed below, a unicode escape, or an octal escape.
fragment
ESC_SEQ
    :   '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    |   UNICODE_ESC
    |   OCTAL_ESC
    ;

// OCTAL_ESC (fragment): backslash plus one to three octal digits; the
// three-digit form requires a first digit in 0..3.
fragment
OCTAL_ESC
    :   '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7')
    ;

// UNICODE_ESC (fragment): backslash, 'u', then exactly four hex digits.
fragment
UNICODE_ESC
    :   '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;
    
// Parser rules. `header` is zero or more `include`s; `include` is a single
// INCLUDE token produced by the lexer rule of that name.
header
	:	include*;
// Earlier token-by-token form of this rule, kept commented out after the ';'.
include	:	INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>';

【总结】

最终还是通过antlr语法，加上手动添加的action code（Java代码），来实现递归处理#include的逻辑。

转载请注明：在路上 » 【记录】尝试用antlr处理C代码中的#include

【记录】尝试用antlr处理C代码中的#include

与本文相关的文章

Hi，您需要填写昵称和邮箱！