src2xml/source/srclexer.py

# *************************************************************
#
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.
#
# *************************************************************

import sys, os.path
from globals import *
import macroparser

class EOF(Exception):
    def __init__ (self):
        pass

    def str (self):
        return "end of file"

class BOF(Exception):
    def __init__ (self):
        pass

    def str (self):
        return "beginning of file"


def removeHeaderQuotes (orig):
    if len(orig) <= 2:
        return orig
    elif orig[0] == orig[-1] == '"':
        return orig[1:-1]
    elif orig[0] == '<' and orig[-1] == '>':
        return orig[1:-1]

    return orig


def dumpTokens (tokens, toError=False):

    scope = 0
    indent = "    "
    line = ''
    chars = ''

    for token in tokens:
        if token in '{<':
            if len(line) > 0:
                chars += indent*scope + line + "\n"
                line = ''
            chars += indent*scope + token + "\n"
            scope += 1

        elif token in '}>':
            if len(line) > 0:
                chars += indent*scope + line + "\n"
                line = ''
            scope -= 1
            chars += indent*scope + token

        elif token == ';':
            if len(line) > 0:
                chars += indent*scope + line + ";\n"
                line = ''
            else:
                chars += ";\n"
        elif len(token) > 0:
            line += token + ' '

    if len(line) > 0:
        chars += line
    chars += "\n"
    if toError:
        sys.stderr.write(chars)
    else:
        sys.stdout.write(chars)


class HeaderData(object):
    def __init__ (self):
        self.defines = {}
        self.tokens = []


class SrcLexer(object):
    """Lexicographical analyzer for .src format.

The role of a lexer is to parse the source file and break it into
appropriate tokens.  Such tokens are later passed to a parser to
build the syntax tree.
"""
    headerCache = {}

    VISIBLE = 0
    INVISIBLE_PRE = 1
    INVISIBLE_POST = 2

    def __init__ (self, chars, filepath = None):
        self.filepath = filepath
        self.parentLexer = None
        self.chars = chars
        self.bufsize = len(self.chars)

        # TODO: use parameters for this
        # Properties that can be copied.
        self.headerDict = dict ()
        self.debug = False
        self.debugMacro = False
        self.includeDirs = list ()
        self.expandHeaders = True
        self.inMacroDefine = False
        self.stopOnHeader = False

    def copyProperties (self, other):
        """Copy properties from another instance of SrcLexer."""

        # TODO: use parameters for this
        self.headerDict = other.headerDict
        self.debug = other.debug
        self.debugMacro = other.debugMacro
        self.includeDirs = other.includeDirs[:]
        self.expandHeaders = other.expandHeaders
        self.inMacroDefine = other.inMacroDefine
        self.stopOnHeader = other.stopOnHeader

    def init (self):
        self.firstNonBlank = ''
        self.token = ''
        self.tokens = []
        self.defines = {}
        self.visibilityStack = []

    def getTokens (self):
        return self.tokens

    def getDefines (self):
        return self.defines

    def nextPos (self, i):
        while True:
            i += 1
            try:
                c = self.chars[i]
            except IndexError:
                raise EOF

            if ord(c) in [0x0D]:
                continue
            break
        return i

    def prevPos (self, i):
        while True:
            i -= 1
            try:
                c = self.chars[i]
            except IndexError:
                raise BOF

            if ord(c) in [0x0D]:
                continue
            break
        return i

    def isCodeVisible (self):
        if len(self.visibilityStack) == 0:
            return True
        for item in self.visibilityStack:
            if item != SrcLexer.VISIBLE:
                return False
        return True

    def tokenize (self):
        self.init()

        i = 0
        while True:
            c = self.chars[i]

            if self.firstNonBlank == '' and not c in [' ', "\n", "\t"]:
                # Store the first non-blank in a line.
                self.firstNonBlank = c
            elif c == "\n":
                self.firstNonBlank = ''

            if c == '#':
                i = self.pound(i)
            elif c == '/':
                i = self.slash(i)
            elif c == "\n":
                i = self.lineBreak(i)
            elif c == '"':
                i = self.doubleQuote(i)
            elif c in [' ', "\t"]:
                i = self.blank(i)
            elif c in ";()[]{}<>,=+-*":
                # Any outstanding single-character token.
                i = self.anyToken(i, c)
            elif self.isCodeVisible():
                self.token += c

            try:
                i = self.nextPos(i)
            except EOF:
                break

        if len(self.token):
            self.tokens.append(self.token)

        if not self.parentLexer and self.debug:
            progress ("-"*68 + "\n")
            progress ("All defines found in this translation unit:\n")
            keys = sorted(self.defines.keys())
            for key in keys:
                progress ("@ %s\n"%key)

    def dumpTokens (self, toError=False):
        dumpTokens(self.tokens, toError)


    def maybeAddToken (self):
        if len(self.token) > 0:
            self.tokens.append(self.token)
            self.token = ''


    #--------------------------------------------------------------------
    # character handlers

    def blank (self, i):
        if not self.isCodeVisible():
            return i

        self.maybeAddToken()
        return i


    def pound (self, i):

        if self.inMacroDefine:
            return i

        if not self.firstNonBlank == '#':
            return i

        self.maybeAddToken()
        # We are in preprocessing mode.

        # Get the macro command name '#<command> .....'

        command, define, buf = '', '', ''
        firstNonBlank = False
        while True:
            try:
                i = self.nextPos(i)
                c = self.chars[i]
                if c == '\\' and self.chars[self.nextPos(i)] == "\n":
                    i = self.nextPos(i)
                    continue
            except EOF:
                break

            if c == "\n":
                if len(buf) > 0 and len(command) == 0:
                    command = buf
                i = self.prevPos(i)
                break
            elif c in [' ', "\t"]:
                if not firstNonBlank:
                    # Ignore any leading blanks after the '#'.
                    continue

                if len(command) == 0:
                    command = buf
                    buf = ''
                else:
                    buf += ' '
            elif c == '(':
                if len(buf) > 0 and len(command) == 0:
                    command = buf
                buf += c
            else:
                if not firstNonBlank:
                    firstNonBlank = True
                buf += c

        if command == 'define':
            self.handleMacroDefine(buf)
        elif command == 'include':
            self.handleMacroInclude(buf)
        elif command == 'ifdef':
            defineName = buf.strip()
            if defineName in self.defines:
                self.visibilityStack.append(SrcLexer.VISIBLE)
            else:
                self.visibilityStack.append(SrcLexer.INVISIBLE_PRE)

        elif command == 'ifndef':
            defineName = buf.strip()
            if defineName in self.defines:
                self.visibilityStack.append(SrcLexer.INVISIBLE_PRE)
            else:
                self.visibilityStack.append(SrcLexer.VISIBLE)

        elif command == 'if':
            if self.evalCodeVisibility(buf):
                self.visibilityStack.append(SrcLexer.VISIBLE)
            else:
                self.visibilityStack.append(SrcLexer.INVISIBLE_PRE)

        elif command == 'elif':
            if len(self.visibilityStack) == 0:
                raise ParseError ('')

            if self.visibilityStack[-1] == SrcLexer.VISIBLE:
                self.visibilityStack[-1] = SrcLexer.INVISIBLE_POST
            elif self.visibilityStack[-1] == SrcLexer.INVISIBLE_PRE:
                # Evaluate only if the current visibility is false.
                if self.evalCodeVisibility(buf):
                    self.visibilityStack[-1] = SrcLexer.VISIBLE

        elif command == 'else':
            if len(self.visibilityStack) == 0:
                raise ParseError ('')

            if self.visibilityStack[-1] == SrcLexer.VISIBLE:
                self.visibilityStack[-1] = SrcLexer.INVISIBLE_POST
            if self.visibilityStack[-1] == SrcLexer.INVISIBLE_PRE:
                self.visibilityStack[-1] = SrcLexer.VISIBLE

        elif command == 'endif':
            if len(self.visibilityStack) == 0:
                raise ParseError ('')
            self.visibilityStack.pop()

        elif command == 'undef':
            pass
        elif command in ['error', 'pragma']:
            pass
        else:
            print("'%s' '%s'"%(command, buf))
            print(self.filepath)
            sys.exit(0)

        return i


    def evalCodeVisibility (self, buf):
        try:
            return eval(buf)
        except:
            return True

    def handleMacroDefine (self, buf):

        mparser = macroparser.MacroParser(buf)
        mparser.debug = self.debugMacro
        mparser.parse()
        macro = mparser.getMacro()
        if macro:
            self.defines[macro.name] = macro

    def handleMacroInclude (self, buf):

        # Strip excess string if any.
        pos = buf.find(' ')
        if pos >= 0:
            buf = buf[:pos]
        headerSub = removeHeaderQuotes(buf)

        if not self.expandHeaders:
            # We don't want to expand headers.  Bail out.
            if self.debug:
                progress ("%s ignored\n"%headerSub)
            return

        defines = {}
        headerPath = None
        for includeDir in self.includeDirs:
            hpath = includeDir + '/' + headerSub
            if os.path.isfile(hpath) and hpath != self.filepath:
                headerPath = hpath
                break

        if not headerPath:
            error("included header file " + headerSub + " not found\n", self.stopOnHeader)
            return

        if self.debug:
            progress ("%s found\n"%headerPath)

        if headerPath in self.headerDict:
            if self.debug:
                progress ("%s already included\n"%headerPath)
            return

        if headerPath in SrcLexer.headerCache:
            if self.debug:
                progress ("%s in cache\n"%headerPath)
            for key in list(SrcLexer.headerCache[headerPath].defines.keys()):
                self.defines[key] = SrcLexer.headerCache[headerPath].defines[key]
            return

        chars = open(headerPath, 'r').read()
        mclexer = SrcLexer(chars, headerPath)
        mclexer.copyProperties(self)
        mclexer.parentLexer = self
        mclexer.tokenize()
        hdrData = HeaderData()
        hdrData.tokens = mclexer.getTokens()
        headerDefines = mclexer.getDefines()
        for key in list(headerDefines.keys()):
            defines[key] = headerDefines[key]
            hdrData.defines[key] = headerDefines[key]

        self.headerDict[headerPath] = True
        SrcLexer.headerCache[headerPath] = hdrData

        # Update the list of headers that have already been expaneded.
        for key in list(mclexer.headerDict.keys()):
            self.headerDict[key] = True

        if self.debug:
            progress ("defines found in header %s:\n"%headerSub)
            for key in list(defines.keys()):
                progress ("  '%s'\n"%key)

        for key in list(defines.keys()):
            self.defines[key] = defines[key]


    def slash (self, i):
        if not self.isCodeVisible():
            return i

        if i < self.bufsize - 1 and self.chars[i+1] == '/':
            # Parse line comment.
            line = ''
            i += 2
            while i < self.bufsize:
                c = self.chars[i]
                if ord(c) in [0x0A, 0x0D]:
                    return i - 1
                line += c
                i += 1
            self.token = ''
        elif i < self.bufsize - 1 and self.chars[i+1] == '*':
            comment = ''
            i += 2
            while i < self.bufsize:
                c = self.chars[i]
                if c == '/' and self.chars[i-1] == '*':
                    return i
                comment += c
                i += 1
        else:
            return self.anyToken(i, '/')

        return i


    def lineBreak (self, i):
        if not self.isCodeVisible():
            return i

        self.maybeAddToken()

        return i


    def doubleQuote (self, i):
        if not self.isCodeVisible():
            return i

        literal = ''
        i += 1
        while i < self.bufsize:
            c = self.chars[i]
            if c == '"':
                self.tokens.append('"'+literal+'"')
                break
            literal += c
            i += 1

        return i


    def anyToken (self, i, token):
        if not self.isCodeVisible():
            return i

        self.maybeAddToken()
        self.token = token
        self.maybeAddToken()
        return i