ステートマシンを作れる仕組みみたいな。
状態遷移表と、状態と入力に対応する処理を入力するだけでステートマシンが作れるような仕組みを作れないかなあと思いちょっと作ってみました。そして、サンプルとしてダブルクオテーションによるエスケープ処理ができるCSVパーサを作ってみました。
BNFから状態遷移表を出力できれば、これと組み合わせてパーサジェネレータが作れるのかなあとか考えたり。やり方はコンパイラの本でも読めば書いてあるのかなあ・・・
# coding: utf-8
"""Table-driven state machine and a sample CSV parser built on top of it."""


class StateMachine(object):
    """Minimal state machine driven by a state transition table.

    Subclasses may override ``input_type`` to map raw inputs onto the column
    labels of the table, and may define ``event_before`` and/or
    ``event_after`` hooks; each hook is called as
    ``hook(prev_state, next_state, inp)``.
    """

    def __init__(self, transitiontable, init):
        """Build the lookup table and set the initial state.

        transitiontable: sequence of rows. The first row is the header: its
            first cell is ignored and the remaining cells are input types.
            Every following row is a state name followed by the next state
            for each input type, in header order.
        init: name of the initial state.
        """
        input_types = transitiontable[0][1:]
        self.table = {}
        for row in transitiontable[1:]:
            # {input type: next state} for the state named in row[0].
            self.table[row[0]] = dict(zip(input_types, row[1:]))
        self.state = init

    def input_type(self, inp):
        """Classify a raw input; the base implementation returns it as is."""
        return inp

    def process(self, inp):
        """Feed one input and perform the matching state transition.

        ``event_before`` runs while ``self.state`` still holds the previous
        state, ``event_after`` runs once it holds the next state.

        Raises KeyError when the current state or the input type has no
        entry in the transition table.
        """
        kind = self.input_type(inp)
        prev_state = self.state
        next_state = self.table[prev_state][kind]

        before = getattr(self, 'event_before', None)
        if before:
            before(prev_state, next_state, inp)

        self.state = next_state

        after = getattr(self, 'event_after', None)
        if after:
            after(prev_state, next_state, inp)


class CSVParser(StateMachine):
    """CSV parser with double-quote escaping, fed one character at a time.

    Parsed rows accumulate in ``self.value`` as a list of lists of strings.

    States:
        NOESC       outside a quoted field
        ESC         inside a quoted field
        ESC-BRANCH  just saw a double quote inside a quoted field; it is
                    either the closing quote or the first half of an
                    escaped (doubled) quote
        EOF         terminal state (it has no row in the table, so any
                    further input raises KeyError)
    """
    # ref: http://www.kasai.fm/wiki/rfc4180jp

    def __init__(self):
        table = """
        NAME;INPUT  ,       "           OTHER   NL      EOF
        NOESC       NOESC   ESC         NOESC   NOESC   EOF
        ESC         ESC     ESC-BRANCH  ESC     ESC     ESC
        ESC-BRANCH  NOESC   ESC         NOESC   NOESC   EOF
        """
        # Drop the empty first line and the whitespace-only last line of the
        # literal, then split every remaining line into cells.
        table = [line.strip().split() for line in table.splitlines()[1:-1]]
        super(CSVParser, self).__init__(table, 'NOESC')
        self.value = []          # finished rows
        self.tmp = []            # characters of the field being read
        self.ncol = None         # column count fixed by the first row
        self.colcount = 0        # columns finished so far in the current row
        self.newlined = False    # True from a row end until the next field
        # True right after a CR ended a row, so that the LF of a CRLF pair
        # is not mistaken for a second (empty) row.
        self._row_ended_by_cr = False

    def input_type(self, inp):
        """Map a character to a table column: ',', '"', NL, EOF or OTHER.

        CR and LF are both NL; the NUL character is the EOF marker.
        """
        if inp in '\n\r':
            return 'NL'
        if inp == '\0':
            return 'EOF'
        if inp in ',"':
            return inp
        return 'OTHER'

    def newrow(self):
        """Finish the current field and the current row.

        Raises ValueError when the row's column count differs from the
        column count of the first row.
        """
        self.newcol()
        if self.ncol is None:
            self.ncol = self.colcount
        elif self.ncol != self.colcount:
            raise ValueError('number of columns does not match')
        self.colcount = 0
        self.newlined = True

    def newcol(self):
        """Append the buffered characters as a new field of the current row."""
        self.colcount += 1
        if not self.value:
            self.value.append([])
        if self.newlined:
            # First field after a row end: open the next row lazily.
            self.value.append([])
            self.newlined = False
        self.value[-1].append(''.join(self.tmp))
        self.tmp = []

    def addbuf(self, ch):
        """Buffer one character of the field being read."""
        self.tmp.append(ch)

    def event_after(self, prev, next, inp):
        """Apply the parsing action for ``inp`` seen in state ``prev``.

        NOTE(review): feeding the NUL EOF marker right after a trailing
        newline still appends a row holding one empty field, because EOF
        always calls ``newcol``; confirm whether callers rely on that.
        """
        it = self.input_type(inp)

        # Treat CRLF as ONE row terminator: the CR already ended the row, so
        # the LF directly behind it is dropped. CR/LF inside a quoted field
        # (prev == 'ESC') never set the flag and are kept verbatim.
        lf_of_crlf = self._row_ended_by_cr and inp == '\n'
        self._row_ended_by_cr = (inp == '\r' and prev != 'ESC')
        if lf_of_crlf:
            return

        if prev == 'NOESC':
            if it == ',' or it == 'EOF':
                self.newcol()
            elif it == 'NL':
                self.newrow()
            elif it == 'OTHER':
                self.addbuf(inp)
            # '"' opens a quoted field and is not part of the data.
        elif prev == 'ESC':
            # Everything except the quote itself is literal field content.
            if it != '"':
                self.addbuf(inp)
        elif prev == 'ESC-BRANCH':
            if it == ',' or it == 'EOF':
                self.newcol()
            elif it == 'NL':
                self.newrow()
            elif it == '"':
                # Doubled quote inside a quoted field: one literal quote.
                self.addbuf('"')
            # OTHER directly after a closing quote is discarded.


def main():
    """Parse a small sample document and print the resulting rows."""
    p = CSVParser()
    inps = u"""a,b,c,d
a",",b"\"\"",c,d
"日""本語","ほげ","ふが","ふげ"
"""
    for ch in inps:
        p.process(ch)
    print(p.value)
    # Under Python 2 this prints:
    # [[u'a', u'b', u'c', u'd'], [u'a,', u'b"', u'c', u'd'], [u'\u65e5"\u672c\u8a9e', u'\u307b\u3052', u'\u3075\u304c', u'\u3075\u3052']]


if __name__ == '__main__':
    main()