#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
6D programming language
Copyright (C) 2011  Danny Milosavljevic

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
import sys
import symbols
from symbols import intern
try:
    import StringIO as io
except ImportError:
    import io

# Half-open [start, end) codepoint ranges that are treated as stand-alone
# mathematical operators rather than as identifier characters.
mathUnicodeRanges = [
    (0x2200, 0x2300),
    (0x27C0, 0x27F0),
    (0x2980, 0x2A00),
    (0x2A00, 0x2B00),
    (0x2100, 0x2150),
    #(0x2308, 0x230C),
    (0x25A0, 0x2600),
    (0x2B30, 0x2B4D),
    #(0x1D400, 0x1D800),
]
"""
>>> unichr(0x800).encode("utf-8")
'\xe0\xa0\x80'
>>> unichr(0x00).encode("utf-8")
'\x00'
>>> unichr(0x900).encode("utf-8")
'\xe0\xa4\x80'
>>> unichr(0x1000).encode("utf-8")
'\xe1\x80\x80'
>>> unichr(0x4000).encode("utf-8")
'\xe4\x80\x80'
>>> unichr(0x9000).encode("utf-8")
'\xe9\x80\x80'
>>> unichr(0x10000).encode("utf-8")
'\xf0\x90\x80\x80'
"""


def mathUnicodeOperatorInRangeP(codepoint):
    """Return True if codepoint lies in one of mathUnicodeRanges."""
    for s, e in mathUnicodeRanges:
        if codepoint >= s and codepoint < e:
            return True
    return False


def UNISKIP(codepoint):
    """Return the number of bytes UTF-8 needs to encode codepoint (1..4)."""
    return 1 if codepoint < 0x80 else \
           2 if codepoint < 0x800 else \
           3 if codepoint < 0x10000 else \
           4


# Sanity check at import time: the tokenizer only recognizes operators that
# are exactly three UTF-8 bytes long (see collectUnicodeOperator3).
for s, e in mathUnicodeRanges:
    assert(UNISKIP(s) == UNISKIP(e))
    assert(UNISKIP(s) == 3)

manydigits = "0123456789abcdefghijklmnopqrstuvwxyz"

# TODO special-case UTF-8 math operators (that they end after the operator char and optional trailing compositing chars)
# FIXME parse "b⋅" properly

# Character predicates.  Each takes the one-character string most recently
# read from the input ("" at end of input) and returns a truthy value when
# the character belongs to the class.


def digitCharP(input):
    return input and input in "0123456789"


def digitRestCharP(input):
    return input and input in "0123456789."  # TODO E+-


def specialCodingCharP(input):
    return input == '#'


def asciiIDCharP(input):
    return input and input in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ?!"


def asciiID2CharP(input):
    return asciiIDCharP(input) or digitCharP(input)


def unicodeIDCharP(input):
    # Callers must not pass "": ord("") raises TypeError.
    return asciiIDCharP(input) or (ord(input) >= 0x80 and ord(input) != 0xE2)  # FIXME don't disallow all U+2xxx


def unicodeIDRestCharP(input):
    # Callers must not pass "": ord("") raises TypeError.
    return asciiIDCharP(input) or digitCharP(input) or (ord(input) >= 0x80 and ord(input) != 0xE2)  # FIXME don't disallow all U+2xxx


def unicodeMaybeOperator1CharP(input):
    return ord(input) == 0xE2  # U+2xxx
    # TODO 0x1D400 once Python supports it.


def operatorCharP(input):
    return input and input in "$%&*+,-./:;<=>@\\^_`|~"


def braceCharP(input):
    return input and input in "()[]{}"


def shebangBodyCharP(input):
    return input and input != '\n'


def octalBodyCharP(input):
    return input and input in "01234567"


def hexadecimalBodyCharP(input):
    return input and input.lower() in "0123456789abcdef"


def binaryBodyCharP(input):
    return input and input in "01"


def stringBodyCharP(input):
    return input and input != '"'  # FIXME escape


def raryBodyCharP(digits):
    """Return a predicate accepting the (case-insensitive) characters in digits."""
    return lambda input: input and input.lower() in digits


def tokenize(inputFile, env):
    """
    Generate tokens read from inputFile, one character at a time.

    inputFile must provide read(1), returning "" at end of input.
    env is called with interned names and must return the handlers the
    tokenizer uses: "error", "#", "#!" and "stringFromSymbol".

    Most tokens are yielded as interned symbols; handler results (error
    values, string literals, "#" forms) are yielded exactly as the handler
    returned them.

    NOTE(review): several token/error names in this function are the empty
    string (intern(""), error("")).  They are reproduced as found; it looks
    like marker names may have been lost from the text - confirm against the
    upstream source.
    """
    error = env(intern("error"))

    def collectText(c, condition, text=""):
        """
        Append characters to text for as long as condition accepts them.

        Returns (collected string, first character not consumed).  A list is
        used rather than StringIO(text), because a StringIO seeded with an
        initial value starts at position 0 and subsequent writes would
        overwrite the seed instead of extending it.
        """
        parts = [text]
        while c != "" and condition(c):  # TODO maybe signal some error on EOF
            parts.append(c)
            c = inputFile.read(1)
        return "".join(parts), c

    def collect(c, condition, text=""):
        """Collect a run of characters starting at c; returns (token, next char)."""
        v, c = collectText(c, condition, text)
        if len(v) > 0:
            return intern(v), c
        else:
            return error(""), c

    def collect1(condition):
        """Like collect, but starts with the next unread character."""
        c = inputFile.read(1)
        return collect(c, condition)

    def collectUnicodeID(c, text):
        """
        it is assumed that text is in entire unicode characters, i.e. not incomplete.
        """
        # < 0xC0 continuing byte
        # >= 0xE0 first byte
        # < 0x80 first and only byte
        #FIXME then use mathUnicodeOperatorInRangeP to check. What to do if it is? We just overread a lot of bytes and are probably in the middle of a normal identifier parsing step, too...
        v, c = collectText(c, unicodeIDRestCharP, text)
        if len(v) > 0:
            return intern(v), c
        else:
            return error(""), c

    def skipWhitespace(c, indents):
        """
        it is assumed that this function sees all the \\n that concern it.
        Only after seeing a \\n, it will fiddle with indentation at all.

        Generator of (token or None, next char) pairs; always yields at least
        once, and the last pair carries the first non-whitespace character.
        indents is the stack of open indentation levels and is modified in place.
        """
        indent = None  # indents[-1]
        while True:
            if c == ' ' or c == '\t':
                if c == '\t':
                    # NOTE(review): a tab seen before any newline also starts
                    # indentation tracking here, which does not match the
                    # docstring - confirm the intended behaviour.
                    if indent is None:
                        indent = 0
                    indent += 1
                c = inputFile.read(1)
            elif c == '\n':
                indent = 0
                val = intern("")
                c = inputFile.read(1)
                yield val, c
            else:
                if indent is not None:
                    if indent > indents[-1]:
                        # Deeper than the current level: open a new level.
                        indents.append(indent)
                        val = intern("")
                        yield val, c
                    elif indent < indents[-1]:
                        # Shallower: close one level per token.
                        while indent < indents[-1]:
                            indents.pop()
                            val = intern("")
                            yield val, c
                    else:
                        # NOTE(review): the placement of this return was
                        # reconstructed from unindented text; here the loop
                        # re-checks after an indent/dedent and ends once
                        # indent equals the top of the stack.
                        yield None, c
                        return
                else:
                    yield None, c
                    return

    def readShebang():
        """Collect the rest of the current line after '#!'."""
        val, c = collect1(shebangBodyCharP)
        return val, c

    def collectSpecialCoding(c):
        """
        Handle a form introduced by '#': #o (octal), #x (hexadecimal),
        #* and #b (binary), #<basis>r (arbitrary radix 2..36), #! (shebang);
        anything else is delegated to the "#" handler from env.
        """
        c = inputFile.read(1)
        # The original called an undefined name "collect2" in the branches
        # below; collect1 (read the next character, then collect) is what
        # these call sites need.
        if c == 'o':
            return collect1(octalBodyCharP)
        elif c == 'x':
            return collect1(hexadecimalBodyCharP)
        elif c == '*':
            return collect1(binaryBodyCharP)
        elif c == 'b':
            return collect1(binaryBodyCharP)
        elif digitCharP(c):  # r
            # Collect the radix as plain text so it can be used as a number;
            # at least one digit is present because of the check above.
            basisText, c = collectText(c, digitCharP)
            basis = int(basisText)
            if c == 'r' and basis >= 2 and basis <= 36:
                digits = manydigits[:basis]
                return collect1(raryBodyCharP(digits))
            else:
                return error(""), c
        elif c == '!':
            val, c = readShebang()
            val = env(intern("#!"))(val)
            return val, c
        else:
            val, c = env(intern("#"))(inputFile, c)
            return val, c

    def collectString(c):
        """
        Collect a string body up to the closing double quote.

        NOTE(review): an empty literal ("") yields the error value because
        collect reports an empty run as an error.
        """
        val, c = collect1(stringBodyCharP)  # FIXME quote
        if c == '"':
            c = inputFile.read(1)
            return val, c
        else:
            return error(""), c

    def collectUnicodeOperator3(c):
        """
        Read a three-byte UTF-8 sequence whose first byte is 0xE2 and return
        it as an operator token if its codepoint is in mathUnicodeRanges,
        otherwise continue reading it as an identifier.

        NOTE(review): this assumes inputFile.read(1) returns single bytes as
        str objects that support .decode (Python 2 byte strings) - confirm.
        """
        assert(ord(c) == 0xE2)  # meaning: 3 bytes total
        c0 = c
        c1 = inputFile.read(1)
        if c1 == "":
            # Truncated sequence; return a (token, next char) pair like every
            # other collector so callers can unpack it.
            return error(""), ""
        c2 = inputFile.read(1)
        if c2 == "":
            return error(""), ""
        vals = (c0 + c1 + c2)
        codepoint = ord(vals.decode("utf-8"))
        c = inputFile.read(1)
        if mathUnicodeOperatorInRangeP(codepoint):
            return intern(vals), c
        else:
            return collectUnicodeID(c, vals)
        # TODO combining suffixes?

    indents = [0]
    while True:
        c = inputFile.read(1)
        while True:
            for val, cc in skipWhitespace(c, indents):
                if val:
                    yield val
                c = cc
            if c == "":
                return
            if digitCharP(c):
                val, c = collect(c, digitRestCharP)
            elif specialCodingCharP(c):
                val, c = collectSpecialCoding(c)
            elif asciiIDCharP(c):
                val, c = collectUnicodeID(c, "")
            elif unicodeMaybeOperator1CharP(c):
                val, c = collectUnicodeOperator3(c)
            elif unicodeIDCharP(c):
                val, c = collectUnicodeID(c, "")
            elif operatorCharP(c):
                val, c = collect(c, operatorCharP)
            elif braceCharP(c):
                val = intern(c)
                c = inputFile.read(1)
            elif c == '@':
                # NOTE(review): unreachable as written, because operatorCharP
                # above already accepts '@'.
                val, c = env(intern("#"))(inputFile, c)
                #val, c = collect1(lambda input: input and input != ':') # TODO less special chars?
            elif c == '"':
                val, c = collectString(c)
                val = env(intern("stringFromSymbol"))(val)
            elif c == '\'':
                val = intern("'")
                c = inputFile.read(1)
            else:
                # Unknown character: report it and fail.  Raised explicitly so
                # the failure is not stripped under "python -O".
                print("%s %d" % (c, ord(c)))
                raise AssertionError("unexpected input character %r" % (c,))
            yield val


if __name__ == "__main__":
    def readHash(inputFile, c):
        """Demo handler for '#' forms: recognizes "#exports" only."""
        if c == 'e':  # exports, probably.
            if inputFile.read(len("xports")) == "xports":
                #if inputFile.read(1) == '[':
                print("FIXME run the entire parser")
                return None, inputFile.read(1)
        # The handler table in env is keyed by interned symbols.
        return (env(intern("error"))(""), "")

    def env(name):
        """Minimal environment providing the handlers tokenize asks for."""
        return {
            intern("#"): readHash,
            intern("#!"): lambda val: None,
            intern("stringFromSymbol"): lambda val: val.text,
            intern("error"): lambda *args: (intern("error"), args),
        }[name]

    inputFile = open(sys.argv[1], "r") if len(sys.argv) > 1 else sys.stdin
    co = 31

    def str1(val):
        """Format a token for display; plain strings are shown quoted."""
        if isinstance(val, str):
            return "%r" % val
        else:
            return str(val)

    try:
        for val in tokenize(inputFile, env):
            # Alternate between two ANSI colours so adjacent tokens are
            # visually distinguishable.
            sys.stdout.write("\033[%dm" % co)
            co = 32 if co == 31 else 31
            sys.stdout.write(str1(val) + " ")
            if(str(val) == ""):
                sys.stdout.write("\n")
        sys.stdout.write("\033[m")
    finally:
        if inputFile is not sys.stdin:
            inputFile.close()