ステートマシンを作れる仕組みみたいな。

状態遷移表と、各状態・入力の組に対応する処理を与えるだけでステートマシンを作れる仕組みができないかと思い、試しに作ってみました。サンプルとして、ダブルクォーテーションによるエスケープ処理に対応したCSVパーサも作ってみました。

BNFから状態遷移表を生成できれば、これと組み合わせてパーサジェネレータが作れるのではないかと考えています。具体的なやり方は、コンパイラの本を読めば書いてあるのかもしれません。

# coding: utf-8
class StateMachine(object):
    """Table-driven finite state machine.

    The transition table is a sequence of rows.  The first row is a header:
    its first cell is ignored and the remaining cells name the input types.
    Every following row starts with a state name, followed by the next
    state for each input type, in header order.

    Subclasses may override ``input_type`` to classify raw inputs, and may
    define ``event_before`` and/or ``event_after`` hooks.  Each hook is
    called as ``hook(prev_state, next_state, inp)``; ``event_before`` runs
    before ``self.state`` is updated and ``event_after`` runs afterwards.
    """

    def __init__(self, transitiontable, init):
        """Build the lookup table and set the starting state.

        transitiontable -- header row plus one row per state (see class doc).
        init            -- name of the initial state.
        """
        # An empty table is accepted and simply yields no transitions.
        if len(transitiontable) > 0:
            input_names = transitiontable[0][1:]
        else:
            input_names = []
        # self.table[state][input_type] -> next state
        self.table = {}
        for row in transitiontable[1:]:
            # Cells are paired with the header positionally.
            self.table[row[0]] = dict(zip(input_names, row[1:]))
        self.state = init

    def input_type(self, inp):
        """Map a raw input to an input-type name; identity by default."""
        return inp

    def process(self, inp):
        """Consume one input and move to the next state.

        Raises KeyError when the current state or the input type is not in
        the transition table; ``self.state`` is left unchanged in that case
        because the lookup happens before the assignment.  Exceptions from
        the hooks propagate unchanged, with their original traceback.
        """
        kind = self.input_type(inp)
        prev_state = self.state
        next_state = self.table[prev_state][kind]

        # Hooks are optional: only call the ones the instance provides.
        before = getattr(self, 'event_before', None)
        if before:
            before(prev_state, next_state, inp)

        self.state = next_state

        after = getattr(self, 'event_after', None)
        if after:
            after(prev_state, next_state, inp)

class CSVParser(StateMachine):
    """Character-by-character CSV parser with double-quote escaping.

    Feed the text one character at a time to ``process``.  A NUL character
    is classified as end of input and flushes the field being read.  Parsed
    rows are collected in ``value`` as a list of lists of strings.

    States:
        NOESC       -- outside a quoted section.
        ESC         -- inside a quoted section; commas and line breaks are
                       taken literally.
        ESC-BRANCH  -- a '"' was just seen inside a quoted section: a second
                       '"' is an escaped quote, anything else means the
                       quoted section has ended.

    ``process`` raises ValueError when a row terminated by a line break has
    a different number of columns than the first row.
    """
    # ref: http://www.kasai.fm/wiki/rfc4180jp

    def __init__(self):
        # Rows are states, columns are input types, cells are next states.
        table = """
        NAME;INPUT ,     "          OTHER NL    EOF
        NOESC      NOESC ESC        NOESC NOESC EOF
        ESC        ESC   ESC-BRANCH ESC   ESC   ESC
        ESC-BRANCH NOESC ESC        NOESC NOESC EOF
        """
        # Drop the blank first line and the indentation-only last line.
        table = [line.strip().split() for line in table.splitlines()[1:-1]]
        super(CSVParser, self).__init__(table, 'NOESC')
        self.value = []        # finished rows, each a list of field strings
        self.tmp = []          # characters of the field currently being read
        self.ncol = None       # column count of the first row, once known
        self.colcount = 0      # columns stored so far in the current row
        self.newlined = False  # a row just ended; next field starts a new row
        # True right after a '\r' ended a row, so that the '\n' of a CRLF
        # pair is not mistaken for a second, empty row.
        self._row_ended_by_cr = False

    def input_type(self, inp):
        """Classify one character as ',', '"', 'NL', 'EOF' or 'OTHER'."""
        # Tuple membership is an exact comparison; a substring test against
        # '\n\r' would also classify '' as a line break.
        if inp in ('\n', '\r'):
            return 'NL'
        if inp == '\0':
            return 'EOF'
        if inp in (',', '"'):
            return inp
        return 'OTHER'

    def newrow(self):
        """Finish the current field and row, checking the column count.

        Raises ValueError if the row's column count differs from the first
        row's.
        """
        self.newcol()
        if self.ncol is None:
            # The first row fixes the expected column count.
            self.ncol = self.colcount
        elif self.ncol != self.colcount:
            raise ValueError('number of columns does not match')
        self.colcount = 0
        self.newlined = True

    def newcol(self):
        """Store the buffered characters as the next field of the row."""
        self.colcount += 1
        if not self.value:
            self.value = [[]]
        if self.newlined:
            # First field after a line break: open the next row lazily so a
            # trailing line break alone does not create an empty row.
            self.value.append([])
            self.newlined = False
        self.value[-1].append(''.join(self.tmp))
        self.tmp = []

    def addbuf(self, ch):
        """Append one character to the field being read."""
        self.tmp.append(ch)

    def event_after(self, prev, next, inp):
        """Apply the parsing action for the transition that just happened.

        Called by ``StateMachine.process`` after the state has changed;
        the action depends on the state being left (``prev``) and on the
        type of ``inp``.
        """
        kind = self.input_type(inp)

        # The flag is only ever set by a '\r' handled outside quotes, so a
        # '\n' arriving while it is set is the second half of a CRLF pair.
        lf_of_crlf = self._row_ended_by_cr and inp == '\n'
        self._row_ended_by_cr = False

        if prev == 'ESC':
            # Inside quotes everything but the quote itself is literal.
            # NOTE(review): this includes the EOF marker, so an unterminated
            # quoted field is never flushed into ``value``.
            if kind in (',', 'OTHER', 'NL', 'EOF'):
                self.addbuf(inp)
            return

        if prev not in ('NOESC', 'ESC-BRANCH'):
            return

        if kind == ',':
            self.newcol()
        elif kind == 'NL':
            if not lf_of_crlf:
                self.newrow()
                self._row_ended_by_cr = (inp == '\r')
        elif kind == 'EOF':
            # NOTE(review): EOF stores the field without the column-count
            # check, and EOF right after a line break adds a row [''].
            self.newcol()
        elif kind == '"':
            # NOESC: opening quote, nothing to store.
            # ESC-BRANCH: doubled quote, i.e. one literal '"'.
            if prev == 'ESC-BRANCH':
                self.addbuf('"')
        elif kind == 'OTHER':
            # NOTE(review): a character directly after a closing quote
            # (prev == 'ESC-BRANCH') is dropped, as in the original.
            if prev == 'NOESC':
                self.addbuf(inp)

def main():
    """Parse a small sample text and print the resulting rows."""
    parser = CSVParser()
    inps = u"""a,b,c,d
a",",b"\"\"",c,d
"日""本語","ほげ","ふが","ふげ"
    """
    # No EOF marker is sent, so the spaces after the last line break stay in
    # the parser's buffer and do not show up in the output.
    for ch in inps:
        parser.process(ch)
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print(parser.value)
    # Output on Python 2:
    # [[u'a', u'b', u'c', u'd'], [u'a,', u'b"', u'c', u'd'], [u'\u65e5"\u672c\u8a9e', u'\u307b\u3052', u'\u3075\u304c', u'\u3075\u3052']]

if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()