AppPkg/Applications/Python: Add Python 2.7.2 sources since the release of Python 2.7.3 made them unavailable from the python.org web site.
These files are a subset of the python-2.7.2.tgz distribution from python.org. Changed files from PyMod-2.7.2 have been copied into the corresponding directories of this tree, replacing the original files in the distribution. Signed-off-by: daryl.mcdaniel@intel.com git-svn-id: https://edk2.svn.sourceforge.net/svnroot/edk2/trunk/edk2@13197 6f19259b-4bc3-4df7-8a09-765794883524
This commit is contained in:
		| @@ -0,0 +1,4 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| """The pgen2 package.""" | ||||
| @@ -0,0 +1,257 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| """Convert graminit.[ch] spit out by pgen to Python code. | ||||
|  | ||||
| Pgen is the Python parser generator.  It is useful to quickly create a | ||||
| parser from a grammar file in Python's grammar notation.  But I don't | ||||
| want my parsers to be written in C (yet), so I'm translating the | ||||
| parsing tables to Python data structures and writing a Python parse | ||||
| engine. | ||||
|  | ||||
| Note that the token numbers are constants determined by the standard | ||||
| Python tokenizer.  The standard token module defines these numbers and | ||||
| their names (the names are not used much).  The token numbers are | ||||
| hardcoded into the Python tokenizer and into pgen.  A Python | ||||
| implementation of the Python tokenizer is also available, in the | ||||
| standard tokenize module. | ||||
|  | ||||
| On the other hand, symbol numbers (representing the grammar's | ||||
| non-terminals) are assigned by pgen based on the actual grammar | ||||
| input. | ||||
|  | ||||
| Note: this module is pretty much obsolete; the pgen module generates | ||||
| equivalent grammar tables directly from the Grammar.txt input file | ||||
| without having to invoke the Python pgen C program. | ||||
|  | ||||
| """ | ||||
|  | ||||
| # Python imports | ||||
| import re | ||||
|  | ||||
| # Local imports | ||||
| from pgen2 import grammar, token | ||||
|  | ||||
|  | ||||
| class Converter(grammar.Grammar): | ||||
|     """Grammar subclass that reads classic pgen output files. | ||||
|  | ||||
|     The run() method reads the tables as produced by the pgen parser | ||||
|     generator, typically contained in two C files, graminit.h and | ||||
|     graminit.c.  The other methods are for internal use only. | ||||
|  | ||||
|     See the base class for more documentation. | ||||
|  | ||||
|     """ | ||||
|  | ||||
|     def run(self, graminit_h, graminit_c): | ||||
|         """Load the grammar tables from the text files written by pgen.""" | ||||
|         self.parse_graminit_h(graminit_h) | ||||
|         self.parse_graminit_c(graminit_c) | ||||
|         self.finish_off() | ||||
|  | ||||
|     def parse_graminit_h(self, filename): | ||||
|         """Parse the .h file written by pgen.  (Internal) | ||||
|  | ||||
|         This file is a sequence of #define statements defining the | ||||
|         nonterminals of the grammar as numbers.  We build two tables | ||||
|         mapping the numbers to names and back. | ||||
|  | ||||
|         """ | ||||
|         try: | ||||
|             f = open(filename) | ||||
|         except IOError, err: | ||||
|             print "Can't open %s: %s" % (filename, err) | ||||
|             return False | ||||
|         self.symbol2number = {} | ||||
|         self.number2symbol = {} | ||||
|         lineno = 0 | ||||
|         for line in f: | ||||
|             lineno += 1 | ||||
|             mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) | ||||
|             if not mo and line.strip(): | ||||
|                 print "%s(%s): can't parse %s" % (filename, lineno, | ||||
|                                                   line.strip()) | ||||
|             else: | ||||
|                 symbol, number = mo.groups() | ||||
|                 number = int(number) | ||||
|                 assert symbol not in self.symbol2number | ||||
|                 assert number not in self.number2symbol | ||||
|                 self.symbol2number[symbol] = number | ||||
|                 self.number2symbol[number] = symbol | ||||
|         return True | ||||
|  | ||||
|     def parse_graminit_c(self, filename): | ||||
|         """Parse the .c file written by pgen.  (Internal) | ||||
|  | ||||
|         The file looks as follows.  The first two lines are always this: | ||||
|  | ||||
|         #include "pgenheaders.h" | ||||
|         #include "grammar.h" | ||||
|  | ||||
|         After that come four blocks: | ||||
|  | ||||
|         1) one or more state definitions | ||||
|         2) a table defining dfas | ||||
|         3) a table defining labels | ||||
|         4) a struct defining the grammar | ||||
|  | ||||
|         A state definition has the following form: | ||||
|         - one or more arc arrays, each of the form: | ||||
|           static arc arcs_<n>_<m>[<k>] = { | ||||
|                   {<i>, <j>}, | ||||
|                   ... | ||||
|           }; | ||||
|         - followed by a state array, of the form: | ||||
|           static state states_<s>[<t>] = { | ||||
|                   {<k>, arcs_<n>_<m>}, | ||||
|                   ... | ||||
|           }; | ||||
|  | ||||
|         """ | ||||
|         try: | ||||
|             f = open(filename) | ||||
|         except IOError, err: | ||||
|             print "Can't open %s: %s" % (filename, err) | ||||
|             return False | ||||
|         # The code below essentially uses f's iterator-ness! | ||||
|         lineno = 0 | ||||
|  | ||||
|         # Expect the two #include lines | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == '#include "pgenheaders.h"\n', (lineno, line) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == '#include "grammar.h"\n', (lineno, line) | ||||
|  | ||||
|         # Parse the state definitions | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         allarcs = {} | ||||
|         states = [] | ||||
|         while line.startswith("static arc "): | ||||
|             while line.startswith("static arc "): | ||||
|                 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", | ||||
|                               line) | ||||
|                 assert mo, (lineno, line) | ||||
|                 n, m, k = map(int, mo.groups()) | ||||
|                 arcs = [] | ||||
|                 for _ in range(k): | ||||
|                     lineno, line = lineno+1, f.next() | ||||
|                     mo = re.match(r"\s+{(\d+), (\d+)},$", line) | ||||
|                     assert mo, (lineno, line) | ||||
|                     i, j = map(int, mo.groups()) | ||||
|                     arcs.append((i, j)) | ||||
|                 lineno, line = lineno+1, f.next() | ||||
|                 assert line == "};\n", (lineno, line) | ||||
|                 allarcs[(n, m)] = arcs | ||||
|                 lineno, line = lineno+1, f.next() | ||||
|             mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) | ||||
|             assert mo, (lineno, line) | ||||
|             s, t = map(int, mo.groups()) | ||||
|             assert s == len(states), (lineno, line) | ||||
|             state = [] | ||||
|             for _ in range(t): | ||||
|                 lineno, line = lineno+1, f.next() | ||||
|                 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) | ||||
|                 assert mo, (lineno, line) | ||||
|                 k, n, m = map(int, mo.groups()) | ||||
|                 arcs = allarcs[n, m] | ||||
|                 assert k == len(arcs), (lineno, line) | ||||
|                 state.append(arcs) | ||||
|             states.append(state) | ||||
|             lineno, line = lineno+1, f.next() | ||||
|             assert line == "};\n", (lineno, line) | ||||
|             lineno, line = lineno+1, f.next() | ||||
|         self.states = states | ||||
|  | ||||
|         # Parse the dfas | ||||
|         dfas = {} | ||||
|         mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) | ||||
|         assert mo, (lineno, line) | ||||
|         ndfas = int(mo.group(1)) | ||||
|         for i in range(ndfas): | ||||
|             lineno, line = lineno+1, f.next() | ||||
|             mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', | ||||
|                           line) | ||||
|             assert mo, (lineno, line) | ||||
|             symbol = mo.group(2) | ||||
|             number, x, y, z = map(int, mo.group(1, 3, 4, 5)) | ||||
|             assert self.symbol2number[symbol] == number, (lineno, line) | ||||
|             assert self.number2symbol[number] == symbol, (lineno, line) | ||||
|             assert x == 0, (lineno, line) | ||||
|             state = states[z] | ||||
|             assert y == len(state), (lineno, line) | ||||
|             lineno, line = lineno+1, f.next() | ||||
|             mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) | ||||
|             assert mo, (lineno, line) | ||||
|             first = {} | ||||
|             rawbitset = eval(mo.group(1)) | ||||
|             for i, c in enumerate(rawbitset): | ||||
|                 byte = ord(c) | ||||
|                 for j in range(8): | ||||
|                     if byte & (1<<j): | ||||
|                         first[i*8 + j] = 1 | ||||
|             dfas[number] = (state, first) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == "};\n", (lineno, line) | ||||
|         self.dfas = dfas | ||||
|  | ||||
|         # Parse the labels | ||||
|         labels = [] | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         mo = re.match(r"static label labels\[(\d+)\] = {$", line) | ||||
|         assert mo, (lineno, line) | ||||
|         nlabels = int(mo.group(1)) | ||||
|         for i in range(nlabels): | ||||
|             lineno, line = lineno+1, f.next() | ||||
|             mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) | ||||
|             assert mo, (lineno, line) | ||||
|             x, y = mo.groups() | ||||
|             x = int(x) | ||||
|             if y == "0": | ||||
|                 y = None | ||||
|             else: | ||||
|                 y = eval(y) | ||||
|             labels.append((x, y)) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == "};\n", (lineno, line) | ||||
|         self.labels = labels | ||||
|  | ||||
|         # Parse the grammar struct | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         mo = re.match(r"\s+(\d+),$", line) | ||||
|         assert mo, (lineno, line) | ||||
|         ndfas = int(mo.group(1)) | ||||
|         assert ndfas == len(self.dfas) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == "\tdfas,\n", (lineno, line) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         mo = re.match(r"\s+{(\d+), labels},$", line) | ||||
|         assert mo, (lineno, line) | ||||
|         nlabels = int(mo.group(1)) | ||||
|         assert nlabels == len(self.labels), (lineno, line) | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         mo = re.match(r"\s+(\d+)$", line) | ||||
|         assert mo, (lineno, line) | ||||
|         start = int(mo.group(1)) | ||||
|         assert start in self.number2symbol, (lineno, line) | ||||
|         self.start = start | ||||
|         lineno, line = lineno+1, f.next() | ||||
|         assert line == "};\n", (lineno, line) | ||||
|         try: | ||||
|             lineno, line = lineno+1, f.next() | ||||
|         except StopIteration: | ||||
|             pass | ||||
|         else: | ||||
|             assert 0, (lineno, line) | ||||
|  | ||||
|     def finish_off(self): | ||||
|         """Create additional useful structures.  (Internal).""" | ||||
|         self.keywords = {} # map from keyword strings to arc labels | ||||
|         self.tokens = {}   # map from numeric token values to arc labels | ||||
|         for ilabel, (type, value) in enumerate(self.labels): | ||||
|             if type == token.NAME and value is not None: | ||||
|                 self.keywords[value] = ilabel | ||||
|             elif value is None: | ||||
|                 self.tokens[type] = ilabel | ||||
| @@ -0,0 +1,140 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| # Modifications: | ||||
| # Copyright 2006 Google, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| """Parser driver. | ||||
|  | ||||
| This provides a high-level interface to parse a file into a syntax tree. | ||||
|  | ||||
| """ | ||||
|  | ||||
| __author__ = "Guido van Rossum <guido@python.org>" | ||||
|  | ||||
| __all__ = ["Driver", "load_grammar"] | ||||
|  | ||||
| # Python imports | ||||
| import codecs | ||||
| import os | ||||
| import logging | ||||
| import StringIO | ||||
| import sys | ||||
|  | ||||
| # Pgen imports | ||||
| from . import grammar, parse, token, tokenize, pgen | ||||
|  | ||||
|  | ||||
| class Driver(object): | ||||
|  | ||||
|     def __init__(self, grammar, convert=None, logger=None): | ||||
|         self.grammar = grammar | ||||
|         if logger is None: | ||||
|             logger = logging.getLogger() | ||||
|         self.logger = logger | ||||
|         self.convert = convert | ||||
|  | ||||
|     def parse_tokens(self, tokens, debug=False): | ||||
|         """Parse a series of tokens and return the syntax tree.""" | ||||
|         # XXX Move the prefix computation into a wrapper around tokenize. | ||||
|         p = parse.Parser(self.grammar, self.convert) | ||||
|         p.setup() | ||||
|         lineno = 1 | ||||
|         column = 0 | ||||
|         type = value = start = end = line_text = None | ||||
|         prefix = u"" | ||||
|         for quintuple in tokens: | ||||
|             type, value, start, end, line_text = quintuple | ||||
|             if start != (lineno, column): | ||||
|                 assert (lineno, column) <= start, ((lineno, column), start) | ||||
|                 s_lineno, s_column = start | ||||
|                 if lineno < s_lineno: | ||||
|                     prefix += "\n" * (s_lineno - lineno) | ||||
|                     lineno = s_lineno | ||||
|                     column = 0 | ||||
|                 if column < s_column: | ||||
|                     prefix += line_text[column:s_column] | ||||
|                     column = s_column | ||||
|             if type in (tokenize.COMMENT, tokenize.NL): | ||||
|                 prefix += value | ||||
|                 lineno, column = end | ||||
|                 if value.endswith("\n"): | ||||
|                     lineno += 1 | ||||
|                     column = 0 | ||||
|                 continue | ||||
|             if type == token.OP: | ||||
|                 type = grammar.opmap[value] | ||||
|             if debug: | ||||
|                 self.logger.debug("%s %r (prefix=%r)", | ||||
|                                   token.tok_name[type], value, prefix) | ||||
|             if p.addtoken(type, value, (prefix, start)): | ||||
|                 if debug: | ||||
|                     self.logger.debug("Stop.") | ||||
|                 break | ||||
|             prefix = "" | ||||
|             lineno, column = end | ||||
|             if value.endswith("\n"): | ||||
|                 lineno += 1 | ||||
|                 column = 0 | ||||
|         else: | ||||
|             # We never broke out -- EOF is too soon (how can this happen???) | ||||
|             raise parse.ParseError("incomplete input", | ||||
|                                    type, value, (prefix, start)) | ||||
|         return p.rootnode | ||||
|  | ||||
|     def parse_stream_raw(self, stream, debug=False): | ||||
|         """Parse a stream and return the syntax tree.""" | ||||
|         tokens = tokenize.generate_tokens(stream.readline) | ||||
|         return self.parse_tokens(tokens, debug) | ||||
|  | ||||
|     def parse_stream(self, stream, debug=False): | ||||
|         """Parse a stream and return the syntax tree.""" | ||||
|         return self.parse_stream_raw(stream, debug) | ||||
|  | ||||
|     def parse_file(self, filename, encoding=None, debug=False): | ||||
|         """Parse a file and return the syntax tree.""" | ||||
|         stream = codecs.open(filename, "r", encoding) | ||||
|         try: | ||||
|             return self.parse_stream(stream, debug) | ||||
|         finally: | ||||
|             stream.close() | ||||
|  | ||||
|     def parse_string(self, text, debug=False): | ||||
|         """Parse a string and return the syntax tree.""" | ||||
|         tokens = tokenize.generate_tokens(StringIO.StringIO(text).readline) | ||||
|         return self.parse_tokens(tokens, debug) | ||||
|  | ||||
|  | ||||
| def load_grammar(gt="Grammar.txt", gp=None, | ||||
|                  save=True, force=False, logger=None): | ||||
|     """Load the grammar (maybe from a pickle).""" | ||||
|     if logger is None: | ||||
|         logger = logging.getLogger() | ||||
|     if gp is None: | ||||
|         head, tail = os.path.splitext(gt) | ||||
|         if tail == ".txt": | ||||
|             tail = "" | ||||
|         gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle" | ||||
|     if force or not _newer(gp, gt): | ||||
|         logger.info("Generating grammar tables from %s", gt) | ||||
|         g = pgen.generate_grammar(gt) | ||||
|         if save: | ||||
|             logger.info("Writing grammar tables to %s", gp) | ||||
|             try: | ||||
|                 g.dump(gp) | ||||
|             except IOError, e: | ||||
|                 logger.info("Writing failed:"+str(e)) | ||||
|     else: | ||||
|         g = grammar.Grammar() | ||||
|         g.load(gp) | ||||
|     return g | ||||
|  | ||||
|  | ||||
| def _newer(a, b): | ||||
|     """Inquire whether file a was written since file b.""" | ||||
|     if not os.path.exists(a): | ||||
|         return False | ||||
|     if not os.path.exists(b): | ||||
|         return True | ||||
|     return os.path.getmtime(a) >= os.path.getmtime(b) | ||||
| @@ -0,0 +1,184 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| """This module defines the data structures used to represent a grammar. | ||||
|  | ||||
| These are a bit arcane because they are derived from the data | ||||
| structures used by Python's 'pgen' parser generator. | ||||
|  | ||||
| There's also a table here mapping operators to their names in the | ||||
| token module; the Python tokenize module reports all operators as the | ||||
| fallback token code OP, but the parser needs the actual token code. | ||||
|  | ||||
| """ | ||||
|  | ||||
| # Python imports | ||||
| import pickle | ||||
|  | ||||
| # Local imports | ||||
| from . import token, tokenize | ||||
|  | ||||
|  | ||||
| class Grammar(object): | ||||
|     """Pgen parsing tables tables conversion class. | ||||
|  | ||||
|     Once initialized, this class supplies the grammar tables for the | ||||
|     parsing engine implemented by parse.py.  The parsing engine | ||||
|     accesses the instance variables directly.  The class here does not | ||||
|     provide initialization of the tables; several subclasses exist to | ||||
|     do this (see the conv and pgen modules). | ||||
|  | ||||
|     The load() method reads the tables from a pickle file, which is | ||||
|     much faster than the other ways offered by subclasses.  The pickle | ||||
|     file is written by calling dump() (after loading the grammar | ||||
|     tables using a subclass).  The report() method prints a readable | ||||
|     representation of the tables to stdout, for debugging. | ||||
|  | ||||
|     The instance variables are as follows: | ||||
|  | ||||
|     symbol2number -- a dict mapping symbol names to numbers.  Symbol | ||||
|                      numbers are always 256 or higher, to distinguish | ||||
|                      them from token numbers, which are between 0 and | ||||
|                      255 (inclusive). | ||||
|  | ||||
|     number2symbol -- a dict mapping numbers to symbol names; | ||||
|                      these two are each other's inverse. | ||||
|  | ||||
|     states        -- a list of DFAs, where each DFA is a list of | ||||
|                      states, each state is is a list of arcs, and each | ||||
|                      arc is a (i, j) pair where i is a label and j is | ||||
|                      a state number.  The DFA number is the index into | ||||
|                      this list.  (This name is slightly confusing.) | ||||
|                      Final states are represented by a special arc of | ||||
|                      the form (0, j) where j is its own state number. | ||||
|  | ||||
|     dfas          -- a dict mapping symbol numbers to (DFA, first) | ||||
|                      pairs, where DFA is an item from the states list | ||||
|                      above, and first is a set of tokens that can | ||||
|                      begin this grammar rule (represented by a dict | ||||
|                      whose values are always 1). | ||||
|  | ||||
|     labels        -- a list of (x, y) pairs where x is either a token | ||||
|                      number or a symbol number, and y is either None | ||||
|                      or a string; the strings are keywords.  The label | ||||
|                      number is the index in this list; label numbers | ||||
|                      are used to mark state transitions (arcs) in the | ||||
|                      DFAs. | ||||
|  | ||||
|     start         -- the number of the grammar's start symbol. | ||||
|  | ||||
|     keywords      -- a dict mapping keyword strings to arc labels. | ||||
|  | ||||
|     tokens        -- a dict mapping token numbers to arc labels. | ||||
|  | ||||
|     """ | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.symbol2number = {} | ||||
|         self.number2symbol = {} | ||||
|         self.states = [] | ||||
|         self.dfas = {} | ||||
|         self.labels = [(0, "EMPTY")] | ||||
|         self.keywords = {} | ||||
|         self.tokens = {} | ||||
|         self.symbol2label = {} | ||||
|         self.start = 256 | ||||
|  | ||||
|     def dump(self, filename): | ||||
|         """Dump the grammar tables to a pickle file.""" | ||||
|         f = open(filename, "wb") | ||||
|         pickle.dump(self.__dict__, f, 2) | ||||
|         f.close() | ||||
|  | ||||
|     def load(self, filename): | ||||
|         """Load the grammar tables from a pickle file.""" | ||||
|         f = open(filename, "rb") | ||||
|         d = pickle.load(f) | ||||
|         f.close() | ||||
|         self.__dict__.update(d) | ||||
|  | ||||
|     def copy(self): | ||||
|         """ | ||||
|         Copy the grammar. | ||||
|         """ | ||||
|         new = self.__class__() | ||||
|         for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", | ||||
|                           "tokens", "symbol2label"): | ||||
|             setattr(new, dict_attr, getattr(self, dict_attr).copy()) | ||||
|         new.labels = self.labels[:] | ||||
|         new.states = self.states[:] | ||||
|         new.start = self.start | ||||
|         return new | ||||
|  | ||||
|     def report(self): | ||||
|         """Dump the grammar tables to standard output, for debugging.""" | ||||
|         from pprint import pprint | ||||
|         print "s2n" | ||||
|         pprint(self.symbol2number) | ||||
|         print "n2s" | ||||
|         pprint(self.number2symbol) | ||||
|         print "states" | ||||
|         pprint(self.states) | ||||
|         print "dfas" | ||||
|         pprint(self.dfas) | ||||
|         print "labels" | ||||
|         pprint(self.labels) | ||||
|         print "start", self.start | ||||
|  | ||||
|  | ||||
| # Map from operator to number (since tokenize doesn't do this) | ||||
|  | ||||
| opmap_raw = """ | ||||
| ( LPAR | ||||
| ) RPAR | ||||
| [ LSQB | ||||
| ] RSQB | ||||
| : COLON | ||||
| , COMMA | ||||
| ; SEMI | ||||
| + PLUS | ||||
| - MINUS | ||||
| * STAR | ||||
| / SLASH | ||||
| | VBAR | ||||
| & AMPER | ||||
| < LESS | ||||
| > GREATER | ||||
| = EQUAL | ||||
| . DOT | ||||
| % PERCENT | ||||
| ` BACKQUOTE | ||||
| { LBRACE | ||||
| } RBRACE | ||||
| @ AT | ||||
| == EQEQUAL | ||||
| != NOTEQUAL | ||||
| <> NOTEQUAL | ||||
| <= LESSEQUAL | ||||
| >= GREATEREQUAL | ||||
| ~ TILDE | ||||
| ^ CIRCUMFLEX | ||||
| << LEFTSHIFT | ||||
| >> RIGHTSHIFT | ||||
| ** DOUBLESTAR | ||||
| += PLUSEQUAL | ||||
| -= MINEQUAL | ||||
| *= STAREQUAL | ||||
| /= SLASHEQUAL | ||||
| %= PERCENTEQUAL | ||||
| &= AMPEREQUAL | ||||
| |= VBAREQUAL | ||||
| ^= CIRCUMFLEXEQUAL | ||||
| <<= LEFTSHIFTEQUAL | ||||
| >>= RIGHTSHIFTEQUAL | ||||
| **= DOUBLESTAREQUAL | ||||
| // DOUBLESLASH | ||||
| //= DOUBLESLASHEQUAL | ||||
| -> RARROW | ||||
| """ | ||||
|  | ||||
| opmap = {} | ||||
| for line in opmap_raw.splitlines(): | ||||
|     if line: | ||||
|         op, name = line.split() | ||||
|         opmap[op] = getattr(token, name) | ||||
| @@ -0,0 +1,60 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| """Safely evaluate Python string literals without using eval().""" | ||||
|  | ||||
| import re | ||||
|  | ||||
| simple_escapes = {"a": "\a", | ||||
|                   "b": "\b", | ||||
|                   "f": "\f", | ||||
|                   "n": "\n", | ||||
|                   "r": "\r", | ||||
|                   "t": "\t", | ||||
|                   "v": "\v", | ||||
|                   "'": "'", | ||||
|                   '"': '"', | ||||
|                   "\\": "\\"} | ||||
|  | ||||
| def escape(m): | ||||
|     all, tail = m.group(0, 1) | ||||
|     assert all.startswith("\\") | ||||
|     esc = simple_escapes.get(tail) | ||||
|     if esc is not None: | ||||
|         return esc | ||||
|     if tail.startswith("x"): | ||||
|         hexes = tail[1:] | ||||
|         if len(hexes) < 2: | ||||
|             raise ValueError("invalid hex string escape ('\\%s')" % tail) | ||||
|         try: | ||||
|             i = int(hexes, 16) | ||||
|         except ValueError: | ||||
|             raise ValueError("invalid hex string escape ('\\%s')" % tail) | ||||
|     else: | ||||
|         try: | ||||
|             i = int(tail, 8) | ||||
|         except ValueError: | ||||
|             raise ValueError("invalid octal string escape ('\\%s')" % tail) | ||||
|     return chr(i) | ||||
|  | ||||
| def evalString(s): | ||||
|     assert s.startswith("'") or s.startswith('"'), repr(s[:1]) | ||||
|     q = s[0] | ||||
|     if s[:3] == q*3: | ||||
|         q = q*3 | ||||
|     assert s.endswith(q), repr(s[-len(q):]) | ||||
|     assert len(s) >= 2*len(q) | ||||
|     s = s[len(q):-len(q)] | ||||
|     return re.sub(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})", escape, s) | ||||
|  | ||||
| def test(): | ||||
|     for i in range(256): | ||||
|         c = chr(i) | ||||
|         s = repr(c) | ||||
|         e = evalString(s) | ||||
|         if e != c: | ||||
|             print i, c, s, e | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     test() | ||||
| @@ -0,0 +1,201 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| """Parser engine for the grammar tables generated by pgen. | ||||
|  | ||||
| The grammar table must be loaded first. | ||||
|  | ||||
| See Parser/parser.c in the Python distribution for additional info on | ||||
| how this parsing engine works. | ||||
|  | ||||
| """ | ||||
|  | ||||
| # Local imports | ||||
| from . import token | ||||
|  | ||||
| class ParseError(Exception): | ||||
|     """Exception to signal the parser is stuck.""" | ||||
|  | ||||
|     def __init__(self, msg, type, value, context): | ||||
|         Exception.__init__(self, "%s: type=%r, value=%r, context=%r" % | ||||
|                            (msg, type, value, context)) | ||||
|         self.msg = msg | ||||
|         self.type = type | ||||
|         self.value = value | ||||
|         self.context = context | ||||
|  | ||||
| class Parser(object): | ||||
|     """Parser engine. | ||||
|  | ||||
|     The proper usage sequence is: | ||||
|  | ||||
|     p = Parser(grammar, [converter])  # create instance | ||||
|     p.setup([start])                  # prepare for parsing | ||||
|     <for each input token>: | ||||
|         if p.addtoken(...):           # parse a token; may raise ParseError | ||||
|             break | ||||
|     root = p.rootnode                 # root of abstract syntax tree | ||||
|  | ||||
|     A Parser instance may be reused by calling setup() repeatedly. | ||||
|  | ||||
|     A Parser instance contains state pertaining to the current token | ||||
|     sequence, and should not be used concurrently by different threads | ||||
|     to parse separate token sequences. | ||||
|  | ||||
|     See driver.py for how to get input tokens by tokenizing a file or | ||||
|     string. | ||||
|  | ||||
|     Parsing is complete when addtoken() returns True; the root of the | ||||
|     abstract syntax tree can then be retrieved from the rootnode | ||||
|     instance variable.  When a syntax error occurs, addtoken() raises | ||||
|     the ParseError exception.  There is no error recovery; the parser | ||||
|     cannot be used after a syntax error was reported (but it can be | ||||
|     reinitialized by calling setup()). | ||||
|  | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, grammar, convert=None): | ||||
|         """Constructor. | ||||
|  | ||||
|         The grammar argument is a grammar.Grammar instance; see the | ||||
|         grammar module for more information. | ||||
|  | ||||
|         The parser is not ready yet for parsing; you must call the | ||||
|         setup() method to get it started. | ||||
|  | ||||
|         The optional convert argument is a function mapping concrete | ||||
|         syntax tree nodes to abstract syntax tree nodes.  If not | ||||
|         given, no conversion is done and the syntax tree produced is | ||||
|         the concrete syntax tree.  If given, it must be a function of | ||||
|         two arguments, the first being the grammar (a grammar.Grammar | ||||
|         instance), and the second being the concrete syntax tree node | ||||
|         to be converted.  The syntax tree is converted from the bottom | ||||
|         up. | ||||
|  | ||||
|         A concrete syntax tree node is a (type, value, context, nodes) | ||||
|         tuple, where type is the node type (a token or symbol number), | ||||
|         value is None for symbols and a string for tokens, context is | ||||
|         None or an opaque value used for error reporting (typically a | ||||
|         (lineno, offset) pair), and nodes is a list of children for | ||||
|         symbols, and None for tokens. | ||||
|  | ||||
|         An abstract syntax tree node may be anything; this is entirely | ||||
|         up to the converter function. | ||||
|  | ||||
|         """ | ||||
|         self.grammar = grammar | ||||
|         self.convert = convert or (lambda grammar, node: node) | ||||
|  | ||||
|     def setup(self, start=None): | ||||
|         """Prepare for parsing. | ||||
|  | ||||
|         This *must* be called before starting to parse. | ||||
|  | ||||
|         The optional argument is an alternative start symbol; it | ||||
|         defaults to the grammar's start symbol. | ||||
|  | ||||
|         You can use a Parser instance to parse any number of programs; | ||||
|         each time you call setup() the parser is reset to an initial | ||||
|         state determined by the (implicit or explicit) start symbol. | ||||
|  | ||||
|         """ | ||||
|         if start is None: | ||||
|             start = self.grammar.start | ||||
|         # Each stack entry is a tuple: (dfa, state, node). | ||||
|         # A node is a tuple: (type, value, context, children), | ||||
|         # where children is a list of nodes or None, and context may be None. | ||||
|         newnode = (start, None, None, []) | ||||
|         stackentry = (self.grammar.dfas[start], 0, newnode) | ||||
|         self.stack = [stackentry] | ||||
|         self.rootnode = None | ||||
|         self.used_names = set() # Aliased to self.rootnode.used_names in pop() | ||||
|  | ||||
|     def addtoken(self, type, value, context): | ||||
|         """Add a token; return True iff this is the end of the program.""" | ||||
|         # Map from token to label | ||||
|         ilabel = self.classify(type, value, context) | ||||
|         # Loop until the token is shifted; may raise exceptions | ||||
|         while True: | ||||
|             dfa, state, node = self.stack[-1] | ||||
|             states, first = dfa | ||||
|             arcs = states[state] | ||||
|             # Look for a state with this label | ||||
|             for i, newstate in arcs: | ||||
|                 t, v = self.grammar.labels[i] | ||||
|                 if ilabel == i: | ||||
|                     # Look it up in the list of labels | ||||
|                     assert t < 256 | ||||
|                     # Shift a token; we're done with it | ||||
|                     self.shift(type, value, newstate, context) | ||||
|                     # Pop while we are in an accept-only state | ||||
|                     state = newstate | ||||
|                     while states[state] == [(0, state)]: | ||||
|                         self.pop() | ||||
|                         if not self.stack: | ||||
|                             # Done parsing! | ||||
|                             return True | ||||
|                         dfa, state, node = self.stack[-1] | ||||
|                         states, first = dfa | ||||
|                     # Done with this token | ||||
|                     return False | ||||
|                 elif t >= 256: | ||||
|                     # See if it's a symbol and if we're in its first set | ||||
|                     itsdfa = self.grammar.dfas[t] | ||||
|                     itsstates, itsfirst = itsdfa | ||||
|                     if ilabel in itsfirst: | ||||
|                         # Push a symbol | ||||
|                         self.push(t, self.grammar.dfas[t], newstate, context) | ||||
|                         break # To continue the outer while loop | ||||
|             else: | ||||
|                 if (0, state) in arcs: | ||||
|                     # An accepting state, pop it and try something else | ||||
|                     self.pop() | ||||
|                     if not self.stack: | ||||
|                         # Done parsing, but another token is input | ||||
|                         raise ParseError("too much input", | ||||
|                                          type, value, context) | ||||
|                 else: | ||||
|                     # No success finding a transition | ||||
|                     raise ParseError("bad input", type, value, context) | ||||
|  | ||||
|     def classify(self, type, value, context): | ||||
|         """Turn a token into a label.  (Internal)""" | ||||
|         if type == token.NAME: | ||||
|             # Keep a listing of all used names | ||||
|             self.used_names.add(value) | ||||
|             # Check for reserved words | ||||
|             ilabel = self.grammar.keywords.get(value) | ||||
|             if ilabel is not None: | ||||
|                 return ilabel | ||||
|         ilabel = self.grammar.tokens.get(type) | ||||
|         if ilabel is None: | ||||
|             raise ParseError("bad token", type, value, context) | ||||
|         return ilabel | ||||
|  | ||||
|     def shift(self, type, value, newstate, context): | ||||
|         """Shift a token.  (Internal)""" | ||||
|         dfa, state, node = self.stack[-1] | ||||
|         newnode = (type, value, context, None) | ||||
|         newnode = self.convert(self.grammar, newnode) | ||||
|         if newnode is not None: | ||||
|             node[-1].append(newnode) | ||||
|         self.stack[-1] = (dfa, newstate, node) | ||||
|  | ||||
|     def push(self, type, newdfa, newstate, context): | ||||
|         """Push a nonterminal.  (Internal)""" | ||||
|         dfa, state, node = self.stack[-1] | ||||
|         newnode = (type, None, context, []) | ||||
|         self.stack[-1] = (dfa, newstate, node) | ||||
|         self.stack.append((newdfa, 0, newnode)) | ||||
|  | ||||
|     def pop(self): | ||||
|         """Pop a nonterminal.  (Internal)""" | ||||
|         popdfa, popstate, popnode = self.stack.pop() | ||||
|         newnode = self.convert(self.grammar, popnode) | ||||
|         if newnode is not None: | ||||
|             if self.stack: | ||||
|                 dfa, state, node = self.stack[-1] | ||||
|                 node[-1].append(newnode) | ||||
|             else: | ||||
|                 self.rootnode = newnode | ||||
|                 self.rootnode.used_names = self.used_names | ||||
| @@ -0,0 +1,386 @@ | ||||
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | ||||
| # Licensed to PSF under a Contributor Agreement. | ||||
|  | ||||
| # Pgen imports | ||||
| from . import grammar, token, tokenize | ||||
|  | ||||
| class PgenGrammar(grammar.Grammar): | ||||
|     pass | ||||
|  | ||||
| class ParserGenerator(object): | ||||
|  | ||||
|     def __init__(self, filename, stream=None): | ||||
|         close_stream = None | ||||
|         if stream is None: | ||||
|             stream = open(filename) | ||||
|             close_stream = stream.close | ||||
|         self.filename = filename | ||||
|         self.stream = stream | ||||
|         self.generator = tokenize.generate_tokens(stream.readline) | ||||
|         self.gettoken() # Initialize lookahead | ||||
|         self.dfas, self.startsymbol = self.parse() | ||||
|         if close_stream is not None: | ||||
|             close_stream() | ||||
|         self.first = {} # map from symbol name to set of tokens | ||||
|         self.addfirstsets() | ||||
|  | ||||
|     def make_grammar(self): | ||||
|         c = PgenGrammar() | ||||
|         names = self.dfas.keys() | ||||
|         names.sort() | ||||
|         names.remove(self.startsymbol) | ||||
|         names.insert(0, self.startsymbol) | ||||
|         for name in names: | ||||
|             i = 256 + len(c.symbol2number) | ||||
|             c.symbol2number[name] = i | ||||
|             c.number2symbol[i] = name | ||||
|         for name in names: | ||||
|             dfa = self.dfas[name] | ||||
|             states = [] | ||||
|             for state in dfa: | ||||
|                 arcs = [] | ||||
|                 for label, next in state.arcs.iteritems(): | ||||
|                     arcs.append((self.make_label(c, label), dfa.index(next))) | ||||
|                 if state.isfinal: | ||||
|                     arcs.append((0, dfa.index(state))) | ||||
|                 states.append(arcs) | ||||
|             c.states.append(states) | ||||
|             c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) | ||||
|         c.start = c.symbol2number[self.startsymbol] | ||||
|         return c | ||||
|  | ||||
|     def make_first(self, c, name): | ||||
|         rawfirst = self.first[name] | ||||
|         first = {} | ||||
|         for label in rawfirst: | ||||
|             ilabel = self.make_label(c, label) | ||||
|             ##assert ilabel not in first # XXX failed on <> ... != | ||||
|             first[ilabel] = 1 | ||||
|         return first | ||||
|  | ||||
|     def make_label(self, c, label): | ||||
|         # XXX Maybe this should be a method on a subclass of converter? | ||||
|         ilabel = len(c.labels) | ||||
|         if label[0].isalpha(): | ||||
|             # Either a symbol name or a named token | ||||
|             if label in c.symbol2number: | ||||
|                 # A symbol name (a non-terminal) | ||||
|                 if label in c.symbol2label: | ||||
|                     return c.symbol2label[label] | ||||
|                 else: | ||||
|                     c.labels.append((c.symbol2number[label], None)) | ||||
|                     c.symbol2label[label] = ilabel | ||||
|                     return ilabel | ||||
|             else: | ||||
|                 # A named token (NAME, NUMBER, STRING) | ||||
|                 itoken = getattr(token, label, None) | ||||
|                 assert isinstance(itoken, int), label | ||||
|                 assert itoken in token.tok_name, label | ||||
|                 if itoken in c.tokens: | ||||
|                     return c.tokens[itoken] | ||||
|                 else: | ||||
|                     c.labels.append((itoken, None)) | ||||
|                     c.tokens[itoken] = ilabel | ||||
|                     return ilabel | ||||
|         else: | ||||
|             # Either a keyword or an operator | ||||
|             assert label[0] in ('"', "'"), label | ||||
|             value = eval(label) | ||||
|             if value[0].isalpha(): | ||||
|                 # A keyword | ||||
|                 if value in c.keywords: | ||||
|                     return c.keywords[value] | ||||
|                 else: | ||||
|                     c.labels.append((token.NAME, value)) | ||||
|                     c.keywords[value] = ilabel | ||||
|                     return ilabel | ||||
|             else: | ||||
|                 # An operator (any non-numeric token) | ||||
|                 itoken = grammar.opmap[value] # Fails if unknown token | ||||
|                 if itoken in c.tokens: | ||||
|                     return c.tokens[itoken] | ||||
|                 else: | ||||
|                     c.labels.append((itoken, None)) | ||||
|                     c.tokens[itoken] = ilabel | ||||
|                     return ilabel | ||||
|  | ||||
|     def addfirstsets(self): | ||||
|         names = self.dfas.keys() | ||||
|         names.sort() | ||||
|         for name in names: | ||||
|             if name not in self.first: | ||||
|                 self.calcfirst(name) | ||||
|             #print name, self.first[name].keys() | ||||
|  | ||||
|     def calcfirst(self, name): | ||||
|         dfa = self.dfas[name] | ||||
|         self.first[name] = None # dummy to detect left recursion | ||||
|         state = dfa[0] | ||||
|         totalset = {} | ||||
|         overlapcheck = {} | ||||
|         for label, next in state.arcs.iteritems(): | ||||
|             if label in self.dfas: | ||||
|                 if label in self.first: | ||||
|                     fset = self.first[label] | ||||
|                     if fset is None: | ||||
|                         raise ValueError("recursion for rule %r" % name) | ||||
|                 else: | ||||
|                     self.calcfirst(label) | ||||
|                     fset = self.first[label] | ||||
|                 totalset.update(fset) | ||||
|                 overlapcheck[label] = fset | ||||
|             else: | ||||
|                 totalset[label] = 1 | ||||
|                 overlapcheck[label] = {label: 1} | ||||
|         inverse = {} | ||||
|         for label, itsfirst in overlapcheck.iteritems(): | ||||
|             for symbol in itsfirst: | ||||
|                 if symbol in inverse: | ||||
|                     raise ValueError("rule %s is ambiguous; %s is in the" | ||||
|                                      " first sets of %s as well as %s" % | ||||
|                                      (name, symbol, label, inverse[symbol])) | ||||
|                 inverse[symbol] = label | ||||
|         self.first[name] = totalset | ||||
|  | ||||
|     def parse(self): | ||||
|         dfas = {} | ||||
|         startsymbol = None | ||||
|         # MSTART: (NEWLINE | RULE)* ENDMARKER | ||||
|         while self.type != token.ENDMARKER: | ||||
|             while self.type == token.NEWLINE: | ||||
|                 self.gettoken() | ||||
|             # RULE: NAME ':' RHS NEWLINE | ||||
|             name = self.expect(token.NAME) | ||||
|             self.expect(token.OP, ":") | ||||
|             a, z = self.parse_rhs() | ||||
|             self.expect(token.NEWLINE) | ||||
|             #self.dump_nfa(name, a, z) | ||||
|             dfa = self.make_dfa(a, z) | ||||
|             #self.dump_dfa(name, dfa) | ||||
|             oldlen = len(dfa) | ||||
|             self.simplify_dfa(dfa) | ||||
|             newlen = len(dfa) | ||||
|             dfas[name] = dfa | ||||
|             #print name, oldlen, newlen | ||||
|             if startsymbol is None: | ||||
|                 startsymbol = name | ||||
|         return dfas, startsymbol | ||||
|  | ||||
|     def make_dfa(self, start, finish): | ||||
|         # To turn an NFA into a DFA, we define the states of the DFA | ||||
|         # to correspond to *sets* of states of the NFA.  Then do some | ||||
|         # state reduction.  Let's represent sets as dicts with 1 for | ||||
|         # values. | ||||
|         assert isinstance(start, NFAState) | ||||
|         assert isinstance(finish, NFAState) | ||||
|         def closure(state): | ||||
|             base = {} | ||||
|             addclosure(state, base) | ||||
|             return base | ||||
|         def addclosure(state, base): | ||||
|             assert isinstance(state, NFAState) | ||||
|             if state in base: | ||||
|                 return | ||||
|             base[state] = 1 | ||||
|             for label, next in state.arcs: | ||||
|                 if label is None: | ||||
|                     addclosure(next, base) | ||||
|         states = [DFAState(closure(start), finish)] | ||||
|         for state in states: # NB states grows while we're iterating | ||||
|             arcs = {} | ||||
|             for nfastate in state.nfaset: | ||||
|                 for label, next in nfastate.arcs: | ||||
|                     if label is not None: | ||||
|                         addclosure(next, arcs.setdefault(label, {})) | ||||
|             for label, nfaset in arcs.iteritems(): | ||||
|                 for st in states: | ||||
|                     if st.nfaset == nfaset: | ||||
|                         break | ||||
|                 else: | ||||
|                     st = DFAState(nfaset, finish) | ||||
|                     states.append(st) | ||||
|                 state.addarc(st, label) | ||||
|         return states # List of DFAState instances; first one is start | ||||
|  | ||||
|     def dump_nfa(self, name, start, finish): | ||||
|         print "Dump of NFA for", name | ||||
|         todo = [start] | ||||
|         for i, state in enumerate(todo): | ||||
|             print "  State", i, state is finish and "(final)" or "" | ||||
|             for label, next in state.arcs: | ||||
|                 if next in todo: | ||||
|                     j = todo.index(next) | ||||
|                 else: | ||||
|                     j = len(todo) | ||||
|                     todo.append(next) | ||||
|                 if label is None: | ||||
|                     print "    -> %d" % j | ||||
|                 else: | ||||
|                     print "    %s -> %d" % (label, j) | ||||
|  | ||||
|     def dump_dfa(self, name, dfa): | ||||
|         print "Dump of DFA for", name | ||||
|         for i, state in enumerate(dfa): | ||||
|             print "  State", i, state.isfinal and "(final)" or "" | ||||
|             for label, next in state.arcs.iteritems(): | ||||
|                 print "    %s -> %d" % (label, dfa.index(next)) | ||||
|  | ||||
|     def simplify_dfa(self, dfa): | ||||
|         # This is not theoretically optimal, but works well enough. | ||||
|         # Algorithm: repeatedly look for two states that have the same | ||||
|         # set of arcs (same labels pointing to the same nodes) and | ||||
|         # unify them, until things stop changing. | ||||
|  | ||||
|         # dfa is a list of DFAState instances | ||||
|         changes = True | ||||
|         while changes: | ||||
|             changes = False | ||||
|             for i, state_i in enumerate(dfa): | ||||
|                 for j in range(i+1, len(dfa)): | ||||
|                     state_j = dfa[j] | ||||
|                     if state_i == state_j: | ||||
|                         #print "  unify", i, j | ||||
|                         del dfa[j] | ||||
|                         for state in dfa: | ||||
|                             state.unifystate(state_j, state_i) | ||||
|                         changes = True | ||||
|                         break | ||||
|  | ||||
|     def parse_rhs(self): | ||||
|         # RHS: ALT ('|' ALT)* | ||||
|         a, z = self.parse_alt() | ||||
|         if self.value != "|": | ||||
|             return a, z | ||||
|         else: | ||||
|             aa = NFAState() | ||||
|             zz = NFAState() | ||||
|             aa.addarc(a) | ||||
|             z.addarc(zz) | ||||
|             while self.value == "|": | ||||
|                 self.gettoken() | ||||
|                 a, z = self.parse_alt() | ||||
|                 aa.addarc(a) | ||||
|                 z.addarc(zz) | ||||
|             return aa, zz | ||||
|  | ||||
|     def parse_alt(self): | ||||
|         # ALT: ITEM+ | ||||
|         a, b = self.parse_item() | ||||
|         while (self.value in ("(", "[") or | ||||
|                self.type in (token.NAME, token.STRING)): | ||||
|             c, d = self.parse_item() | ||||
|             b.addarc(c) | ||||
|             b = d | ||||
|         return a, b | ||||
|  | ||||
|     def parse_item(self): | ||||
|         # ITEM: '[' RHS ']' | ATOM ['+' | '*'] | ||||
|         if self.value == "[": | ||||
|             self.gettoken() | ||||
|             a, z = self.parse_rhs() | ||||
|             self.expect(token.OP, "]") | ||||
|             a.addarc(z) | ||||
|             return a, z | ||||
|         else: | ||||
|             a, z = self.parse_atom() | ||||
|             value = self.value | ||||
|             if value not in ("+", "*"): | ||||
|                 return a, z | ||||
|             self.gettoken() | ||||
|             z.addarc(a) | ||||
|             if value == "+": | ||||
|                 return a, z | ||||
|             else: | ||||
|                 return a, a | ||||
|  | ||||
|     def parse_atom(self): | ||||
|         # ATOM: '(' RHS ')' | NAME | STRING | ||||
|         if self.value == "(": | ||||
|             self.gettoken() | ||||
|             a, z = self.parse_rhs() | ||||
|             self.expect(token.OP, ")") | ||||
|             return a, z | ||||
|         elif self.type in (token.NAME, token.STRING): | ||||
|             a = NFAState() | ||||
|             z = NFAState() | ||||
|             a.addarc(z, self.value) | ||||
|             self.gettoken() | ||||
|             return a, z | ||||
|         else: | ||||
|             self.raise_error("expected (...) or NAME or STRING, got %s/%s", | ||||
|                              self.type, self.value) | ||||
|  | ||||
|     def expect(self, type, value=None): | ||||
|         if self.type != type or (value is not None and self.value != value): | ||||
|             self.raise_error("expected %s/%s, got %s/%s", | ||||
|                              type, value, self.type, self.value) | ||||
|         value = self.value | ||||
|         self.gettoken() | ||||
|         return value | ||||
|  | ||||
|     def gettoken(self): | ||||
|         tup = self.generator.next() | ||||
|         while tup[0] in (tokenize.COMMENT, tokenize.NL): | ||||
|             tup = self.generator.next() | ||||
|         self.type, self.value, self.begin, self.end, self.line = tup | ||||
|         #print token.tok_name[self.type], repr(self.value) | ||||
|  | ||||
|     def raise_error(self, msg, *args): | ||||
|         if args: | ||||
|             try: | ||||
|                 msg = msg % args | ||||
|             except: | ||||
|                 msg = " ".join([msg] + map(str, args)) | ||||
|         raise SyntaxError(msg, (self.filename, self.end[0], | ||||
|                                 self.end[1], self.line)) | ||||
|  | ||||
| class NFAState(object): | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.arcs = [] # list of (label, NFAState) pairs | ||||
|  | ||||
|     def addarc(self, next, label=None): | ||||
|         assert label is None or isinstance(label, str) | ||||
|         assert isinstance(next, NFAState) | ||||
|         self.arcs.append((label, next)) | ||||
|  | ||||
| class DFAState(object): | ||||
|  | ||||
|     def __init__(self, nfaset, final): | ||||
|         assert isinstance(nfaset, dict) | ||||
|         assert isinstance(iter(nfaset).next(), NFAState) | ||||
|         assert isinstance(final, NFAState) | ||||
|         self.nfaset = nfaset | ||||
|         self.isfinal = final in nfaset | ||||
|         self.arcs = {} # map from label to DFAState | ||||
|  | ||||
|     def addarc(self, next, label): | ||||
|         assert isinstance(label, str) | ||||
|         assert label not in self.arcs | ||||
|         assert isinstance(next, DFAState) | ||||
|         self.arcs[label] = next | ||||
|  | ||||
|     def unifystate(self, old, new): | ||||
|         for label, next in self.arcs.iteritems(): | ||||
|             if next is old: | ||||
|                 self.arcs[label] = new | ||||
|  | ||||
|     def __eq__(self, other): | ||||
|         # Equality test -- ignore the nfaset instance variable | ||||
|         assert isinstance(other, DFAState) | ||||
|         if self.isfinal != other.isfinal: | ||||
|             return False | ||||
|         # Can't just return self.arcs == other.arcs, because that | ||||
|         # would invoke this method recursively, with cycles... | ||||
|         if len(self.arcs) != len(other.arcs): | ||||
|             return False | ||||
|         for label, next in self.arcs.iteritems(): | ||||
|             if next is not other.arcs.get(label): | ||||
|                 return False | ||||
|         return True | ||||
|  | ||||
|     __hash__ = None # For Py3 compatibility. | ||||
|  | ||||
| def generate_grammar(filename="Grammar.txt"): | ||||
|     p = ParserGenerator(filename) | ||||
|     return p.make_grammar() | ||||
| @@ -0,0 +1,82 @@ | ||||
| #! /usr/bin/env python | ||||
|  | ||||
| """Token constants (from "token.h").""" | ||||
|  | ||||
| #  Taken from Python (r53757) and modified to include some tokens | ||||
| #   originally monkeypatched in by pgen2.tokenize | ||||
|  | ||||
| #--start constants-- | ||||
| ENDMARKER = 0 | ||||
| NAME = 1 | ||||
| NUMBER = 2 | ||||
| STRING = 3 | ||||
| NEWLINE = 4 | ||||
| INDENT = 5 | ||||
| DEDENT = 6 | ||||
| LPAR = 7 | ||||
| RPAR = 8 | ||||
| LSQB = 9 | ||||
| RSQB = 10 | ||||
| COLON = 11 | ||||
| COMMA = 12 | ||||
| SEMI = 13 | ||||
| PLUS = 14 | ||||
| MINUS = 15 | ||||
| STAR = 16 | ||||
| SLASH = 17 | ||||
| VBAR = 18 | ||||
| AMPER = 19 | ||||
| LESS = 20 | ||||
| GREATER = 21 | ||||
| EQUAL = 22 | ||||
| DOT = 23 | ||||
| PERCENT = 24 | ||||
| BACKQUOTE = 25 | ||||
| LBRACE = 26 | ||||
| RBRACE = 27 | ||||
| EQEQUAL = 28 | ||||
| NOTEQUAL = 29 | ||||
| LESSEQUAL = 30 | ||||
| GREATEREQUAL = 31 | ||||
| TILDE = 32 | ||||
| CIRCUMFLEX = 33 | ||||
| LEFTSHIFT = 34 | ||||
| RIGHTSHIFT = 35 | ||||
| DOUBLESTAR = 36 | ||||
| PLUSEQUAL = 37 | ||||
| MINEQUAL = 38 | ||||
| STAREQUAL = 39 | ||||
| SLASHEQUAL = 40 | ||||
| PERCENTEQUAL = 41 | ||||
| AMPEREQUAL = 42 | ||||
| VBAREQUAL = 43 | ||||
| CIRCUMFLEXEQUAL = 44 | ||||
| LEFTSHIFTEQUAL = 45 | ||||
| RIGHTSHIFTEQUAL = 46 | ||||
| DOUBLESTAREQUAL = 47 | ||||
| DOUBLESLASH = 48 | ||||
| DOUBLESLASHEQUAL = 49 | ||||
| AT = 50 | ||||
| OP = 51 | ||||
| COMMENT = 52 | ||||
| NL = 53 | ||||
| RARROW = 54 | ||||
| ERRORTOKEN = 55 | ||||
| N_TOKENS = 56 | ||||
| NT_OFFSET = 256 | ||||
| #--end constants-- | ||||
|  | ||||
| tok_name = {} | ||||
| for _name, _value in globals().items(): | ||||
|     if type(_value) is type(0): | ||||
|         tok_name[_value] = _name | ||||
|  | ||||
|  | ||||
| def ISTERMINAL(x): | ||||
|     return x < NT_OFFSET | ||||
|  | ||||
| def ISNONTERMINAL(x): | ||||
|     return x >= NT_OFFSET | ||||
|  | ||||
| def ISEOF(x): | ||||
|     return x == ENDMARKER | ||||
| @@ -0,0 +1,500 @@ | ||||
| # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. | ||||
| # All rights reserved. | ||||
|  | ||||
| """Tokenization help for Python programs. | ||||
|  | ||||
| generate_tokens(readline) is a generator that breaks a stream of | ||||
| text into Python tokens.  It accepts a readline-like method which is called | ||||
| repeatedly to get the next line of input (or "" for EOF).  It generates | ||||
| 5-tuples with these members: | ||||
|  | ||||
|     the token type (see token.py) | ||||
|     the token (a string) | ||||
|     the starting (row, column) indices of the token (a 2-tuple of ints) | ||||
|     the ending (row, column) indices of the token (a 2-tuple of ints) | ||||
|     the original line (string) | ||||
|  | ||||
| It is designed to match the working of the Python tokenizer exactly, except | ||||
| that it produces COMMENT tokens for comments and gives type OP for all | ||||
| operators | ||||
|  | ||||
| Older entry points | ||||
|     tokenize_loop(readline, tokeneater) | ||||
|     tokenize(readline, tokeneater=printtoken) | ||||
| are the same, except instead of generating tokens, tokeneater is a callback | ||||
| function to which the 5 fields described above are passed as 5 arguments, | ||||
| each time a new token is found.""" | ||||
|  | ||||
| __author__ = 'Ka-Ping Yee <ping@lfw.org>' | ||||
| __credits__ = \ | ||||
|     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' | ||||
|  | ||||
| import string, re | ||||
| from codecs import BOM_UTF8, lookup | ||||
| from lib2to3.pgen2.token import * | ||||
|  | ||||
| from . import token | ||||
| __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize", | ||||
|            "generate_tokens", "untokenize"] | ||||
| del token | ||||
|  | ||||
| try: | ||||
|     bytes | ||||
| except NameError: | ||||
|     # Support bytes type in Python <= 2.5, so 2to3 turns itself into | ||||
|     # valid Python 3 code. | ||||
|     bytes = str | ||||
|  | ||||
| def group(*choices): return '(' + '|'.join(choices) + ')' | ||||
| def any(*choices): return group(*choices) + '*' | ||||
| def maybe(*choices): return group(*choices) + '?' | ||||
|  | ||||
| Whitespace = r'[ \f\t]*' | ||||
| Comment = r'#[^\r\n]*' | ||||
| Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) | ||||
| Name = r'[a-zA-Z_]\w*' | ||||
|  | ||||
| Binnumber = r'0[bB][01]*' | ||||
| Hexnumber = r'0[xX][\da-fA-F]*[lL]?' | ||||
| Octnumber = r'0[oO]?[0-7]*[lL]?' | ||||
| Decnumber = r'[1-9]\d*[lL]?' | ||||
| Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) | ||||
| Exponent = r'[eE][-+]?\d+' | ||||
| Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) | ||||
| Expfloat = r'\d+' + Exponent | ||||
| Floatnumber = group(Pointfloat, Expfloat) | ||||
| Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') | ||||
| Number = group(Imagnumber, Floatnumber, Intnumber) | ||||
|  | ||||
| # Tail end of ' string. | ||||
| Single = r"[^'\\]*(?:\\.[^'\\]*)*'" | ||||
| # Tail end of " string. | ||||
| Double = r'[^"\\]*(?:\\.[^"\\]*)*"' | ||||
| # Tail end of ''' string. | ||||
| Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" | ||||
| # Tail end of """ string. | ||||
| Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' | ||||
| Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""') | ||||
| # Single-line ' or " string. | ||||
| String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", | ||||
|                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') | ||||
|  | ||||
| # Because of leftmost-then-longest match semantics, be sure to put the | ||||
| # longest operators first (e.g., if = came before ==, == would get | ||||
| # recognized as two instances of =). | ||||
| Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", | ||||
|                  r"//=?", r"->", | ||||
|                  r"[+\-*/%&|^=<>]=?", | ||||
|                  r"~") | ||||
|  | ||||
| Bracket = '[][(){}]' | ||||
| Special = group(r'\r?\n', r'[:;.,`@]') | ||||
| Funny = group(Operator, Bracket, Special) | ||||
|  | ||||
| PlainToken = group(Number, Funny, String, Name) | ||||
| Token = Ignore + PlainToken | ||||
|  | ||||
| # First (or only) line of ' or " string. | ||||
| ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + | ||||
|                 group("'", r'\\\r?\n'), | ||||
|                 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + | ||||
|                 group('"', r'\\\r?\n')) | ||||
| PseudoExtras = group(r'\\\r?\n', Comment, Triple) | ||||
| PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) | ||||
|  | ||||
| tokenprog, pseudoprog, single3prog, double3prog = map( | ||||
|     re.compile, (Token, PseudoToken, Single3, Double3)) | ||||
| endprogs = {"'": re.compile(Single), '"': re.compile(Double), | ||||
|             "'''": single3prog, '"""': double3prog, | ||||
|             "r'''": single3prog, 'r"""': double3prog, | ||||
|             "u'''": single3prog, 'u"""': double3prog, | ||||
|             "b'''": single3prog, 'b"""': double3prog, | ||||
|             "ur'''": single3prog, 'ur"""': double3prog, | ||||
|             "br'''": single3prog, 'br"""': double3prog, | ||||
|             "R'''": single3prog, 'R"""': double3prog, | ||||
|             "U'''": single3prog, 'U"""': double3prog, | ||||
|             "B'''": single3prog, 'B"""': double3prog, | ||||
|             "uR'''": single3prog, 'uR"""': double3prog, | ||||
|             "Ur'''": single3prog, 'Ur"""': double3prog, | ||||
|             "UR'''": single3prog, 'UR"""': double3prog, | ||||
|             "bR'''": single3prog, 'bR"""': double3prog, | ||||
|             "Br'''": single3prog, 'Br"""': double3prog, | ||||
|             "BR'''": single3prog, 'BR"""': double3prog, | ||||
|             'r': None, 'R': None, | ||||
|             'u': None, 'U': None, | ||||
|             'b': None, 'B': None} | ||||
|  | ||||
| triple_quoted = {} | ||||
| for t in ("'''", '"""', | ||||
|           "r'''", 'r"""', "R'''", 'R"""', | ||||
|           "u'''", 'u"""', "U'''", 'U"""', | ||||
|           "b'''", 'b"""', "B'''", 'B"""', | ||||
|           "ur'''", 'ur"""', "Ur'''", 'Ur"""', | ||||
|           "uR'''", 'uR"""', "UR'''", 'UR"""', | ||||
|           "br'''", 'br"""', "Br'''", 'Br"""', | ||||
|           "bR'''", 'bR"""', "BR'''", 'BR"""',): | ||||
|     triple_quoted[t] = t | ||||
| single_quoted = {} | ||||
| for t in ("'", '"', | ||||
|           "r'", 'r"', "R'", 'R"', | ||||
|           "u'", 'u"', "U'", 'U"', | ||||
|           "b'", 'b"', "B'", 'B"', | ||||
|           "ur'", 'ur"', "Ur'", 'Ur"', | ||||
|           "uR'", 'uR"', "UR'", 'UR"', | ||||
|           "br'", 'br"', "Br'", 'Br"', | ||||
|           "bR'", 'bR"', "BR'", 'BR"', ): | ||||
|     single_quoted[t] = t | ||||
|  | ||||
| tabsize = 8 | ||||
|  | ||||
| class TokenError(Exception): pass | ||||
|  | ||||
| class StopTokenizing(Exception): pass | ||||
|  | ||||
| def printtoken(type, token, start, end, line): # for testing | ||||
|     (srow, scol) = start | ||||
|     (erow, ecol) = end | ||||
|     print "%d,%d-%d,%d:\t%s\t%s" % \ | ||||
|         (srow, scol, erow, ecol, tok_name[type], repr(token)) | ||||
|  | ||||
| def tokenize(readline, tokeneater=printtoken): | ||||
|     """ | ||||
|     The tokenize() function accepts two parameters: one representing the | ||||
|     input stream, and one providing an output mechanism for tokenize(). | ||||
|  | ||||
|     The first parameter, readline, must be a callable object which provides | ||||
|     the same interface as the readline() method of built-in file objects. | ||||
|     Each call to the function should return one line of input as a string. | ||||
|  | ||||
|     The second parameter, tokeneater, must also be a callable object. It is | ||||
|     called once for each token, with five arguments, corresponding to the | ||||
|     tuples generated by generate_tokens(). | ||||
|     """ | ||||
|     try: | ||||
|         tokenize_loop(readline, tokeneater) | ||||
|     except StopTokenizing: | ||||
|         pass | ||||
|  | ||||
| # backwards compatible interface | ||||
| def tokenize_loop(readline, tokeneater): | ||||
|     for token_info in generate_tokens(readline): | ||||
|         tokeneater(*token_info) | ||||
|  | ||||
| class Untokenizer: | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.tokens = [] | ||||
|         self.prev_row = 1 | ||||
|         self.prev_col = 0 | ||||
|  | ||||
|     def add_whitespace(self, start): | ||||
|         row, col = start | ||||
|         assert row <= self.prev_row | ||||
|         col_offset = col - self.prev_col | ||||
|         if col_offset: | ||||
|             self.tokens.append(" " * col_offset) | ||||
|  | ||||
|     def untokenize(self, iterable): | ||||
|         for t in iterable: | ||||
|             if len(t) == 2: | ||||
|                 self.compat(t, iterable) | ||||
|                 break | ||||
|             tok_type, token, start, end, line = t | ||||
|             self.add_whitespace(start) | ||||
|             self.tokens.append(token) | ||||
|             self.prev_row, self.prev_col = end | ||||
|             if tok_type in (NEWLINE, NL): | ||||
|                 self.prev_row += 1 | ||||
|                 self.prev_col = 0 | ||||
|         return "".join(self.tokens) | ||||
|  | ||||
|     def compat(self, token, iterable): | ||||
|         startline = False | ||||
|         indents = [] | ||||
|         toks_append = self.tokens.append | ||||
|         toknum, tokval = token | ||||
|         if toknum in (NAME, NUMBER): | ||||
|             tokval += ' ' | ||||
|         if toknum in (NEWLINE, NL): | ||||
|             startline = True | ||||
|         for tok in iterable: | ||||
|             toknum, tokval = tok[:2] | ||||
|  | ||||
|             if toknum in (NAME, NUMBER): | ||||
|                 tokval += ' ' | ||||
|  | ||||
|             if toknum == INDENT: | ||||
|                 indents.append(tokval) | ||||
|                 continue | ||||
|             elif toknum == DEDENT: | ||||
|                 indents.pop() | ||||
|                 continue | ||||
|             elif toknum in (NEWLINE, NL): | ||||
|                 startline = True | ||||
|             elif startline and indents: | ||||
|                 toks_append(indents[-1]) | ||||
|                 startline = False | ||||
|             toks_append(tokval) | ||||
|  | ||||
| cookie_re = re.compile("coding[:=]\s*([-\w.]+)") | ||||
|  | ||||
| def _get_normal_name(orig_enc): | ||||
|     """Imitates get_normal_name in tokenizer.c.""" | ||||
|     # Only care about the first 12 characters. | ||||
|     enc = orig_enc[:12].lower().replace("_", "-") | ||||
|     if enc == "utf-8" or enc.startswith("utf-8-"): | ||||
|         return "utf-8" | ||||
|     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ | ||||
|        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): | ||||
|         return "iso-8859-1" | ||||
|     return orig_enc | ||||
|  | ||||
| def detect_encoding(readline): | ||||
|     """ | ||||
|     The detect_encoding() function is used to detect the encoding that should | ||||
|     be used to decode a Python source file. It requires one argment, readline, | ||||
|     in the same way as the tokenize() generator. | ||||
|  | ||||
|     It will call readline a maximum of twice, and return the encoding used | ||||
|     (as a string) and a list of any lines (left as bytes) it has read | ||||
|     in. | ||||
|  | ||||
|     It detects the encoding from the presence of a utf-8 bom or an encoding | ||||
|     cookie as specified in pep-0263. If both a bom and a cookie are present, but | ||||
|     disagree, a SyntaxError will be raised. If the encoding cookie is an invalid | ||||
|     charset, raise a SyntaxError.  Note that if a utf-8 bom is found, | ||||
|     'utf-8-sig' is returned. | ||||
|  | ||||
|     If no encoding is specified, then the default of 'utf-8' will be returned. | ||||
|     """ | ||||
|     bom_found = False | ||||
|     encoding = None | ||||
|     default = 'utf-8' | ||||
|     def read_or_stop(): | ||||
|         try: | ||||
|             return readline() | ||||
|         except StopIteration: | ||||
|             return bytes() | ||||
|  | ||||
|     def find_cookie(line): | ||||
|         try: | ||||
|             line_string = line.decode('ascii') | ||||
|         except UnicodeDecodeError: | ||||
|             return None | ||||
|  | ||||
|         matches = cookie_re.findall(line_string) | ||||
|         if not matches: | ||||
|             return None | ||||
|         encoding = _get_normal_name(matches[0]) | ||||
|         try: | ||||
|             codec = lookup(encoding) | ||||
|         except LookupError: | ||||
|             # This behaviour mimics the Python interpreter | ||||
|             raise SyntaxError("unknown encoding: " + encoding) | ||||
|  | ||||
|         if bom_found: | ||||
|             if codec.name != 'utf-8': | ||||
|                 # This behaviour mimics the Python interpreter | ||||
|                 raise SyntaxError('encoding problem: utf-8') | ||||
|             encoding += '-sig' | ||||
|         return encoding | ||||
|  | ||||
|     first = read_or_stop() | ||||
|     if first.startswith(BOM_UTF8): | ||||
|         bom_found = True | ||||
|         first = first[3:] | ||||
|         default = 'utf-8-sig' | ||||
|     if not first: | ||||
|         return default, [] | ||||
|  | ||||
|     encoding = find_cookie(first) | ||||
|     if encoding: | ||||
|         return encoding, [first] | ||||
|  | ||||
|     second = read_or_stop() | ||||
|     if not second: | ||||
|         return default, [first] | ||||
|  | ||||
|     encoding = find_cookie(second) | ||||
|     if encoding: | ||||
|         return encoding, [first, second] | ||||
|  | ||||
|     return default, [first, second] | ||||
|  | ||||
| def untokenize(iterable): | ||||
|     """Transform tokens back into Python source code. | ||||
|  | ||||
|     Each element returned by the iterable must be a token sequence | ||||
|     with at least two elements, a token number and token value.  If | ||||
|     only two tokens are passed, the resulting output is poor. | ||||
|  | ||||
|     Round-trip invariant for full input: | ||||
|         Untokenized source will match input source exactly | ||||
|  | ||||
|     Round-trip invariant for limited intput: | ||||
|         # Output text will tokenize the back to the input | ||||
|         t1 = [tok[:2] for tok in generate_tokens(f.readline)] | ||||
|         newcode = untokenize(t1) | ||||
|         readline = iter(newcode.splitlines(1)).next | ||||
|         t2 = [tok[:2] for tokin generate_tokens(readline)] | ||||
|         assert t1 == t2 | ||||
|     """ | ||||
|     ut = Untokenizer() | ||||
|     return ut.untokenize(iterable) | ||||
|  | ||||
| def generate_tokens(readline): | ||||
|     """ | ||||
|     The generate_tokens() generator requires one argment, readline, which | ||||
|     must be a callable object which provides the same interface as the | ||||
|     readline() method of built-in file objects. Each call to the function | ||||
|     should return one line of input as a string.  Alternately, readline | ||||
|     can be a callable function terminating with StopIteration: | ||||
|         readline = open(myfile).next    # Example of alternate readline | ||||
|  | ||||
|     The generator produces 5-tuples with these members: the token type; the | ||||
|     token string; a 2-tuple (srow, scol) of ints specifying the row and | ||||
|     column where the token begins in the source; a 2-tuple (erow, ecol) of | ||||
|     ints specifying the row and column where the token ends in the source; | ||||
|     and the line on which the token was found. The line passed is the | ||||
|     logical line; continuation lines are included. | ||||
|     """ | ||||
|     lnum = parenlev = continued = 0 | ||||
|     namechars, numchars = string.ascii_letters + '_', '0123456789' | ||||
|     contstr, needcont = '', 0 | ||||
|     contline = None | ||||
|     indents = [0] | ||||
|  | ||||
|     while 1:                                   # loop over lines in stream | ||||
|         try: | ||||
|             line = readline() | ||||
|         except StopIteration: | ||||
|             line = '' | ||||
|         lnum = lnum + 1 | ||||
|         pos, max = 0, len(line) | ||||
|  | ||||
|         if contstr:                            # continued string | ||||
|             if not line: | ||||
|                 raise TokenError, ("EOF in multi-line string", strstart) | ||||
|             endmatch = endprog.match(line) | ||||
|             if endmatch: | ||||
|                 pos = end = endmatch.end(0) | ||||
|                 yield (STRING, contstr + line[:end], | ||||
|                        strstart, (lnum, end), contline + line) | ||||
|                 contstr, needcont = '', 0 | ||||
|                 contline = None | ||||
|             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': | ||||
|                 yield (ERRORTOKEN, contstr + line, | ||||
|                            strstart, (lnum, len(line)), contline) | ||||
|                 contstr = '' | ||||
|                 contline = None | ||||
|                 continue | ||||
|             else: | ||||
|                 contstr = contstr + line | ||||
|                 contline = contline + line | ||||
|                 continue | ||||
|  | ||||
|         elif parenlev == 0 and not continued:  # new statement | ||||
|             if not line: break | ||||
|             column = 0 | ||||
|             while pos < max:                   # measure leading whitespace | ||||
|                 if line[pos] == ' ': column = column + 1 | ||||
|                 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize | ||||
|                 elif line[pos] == '\f': column = 0 | ||||
|                 else: break | ||||
|                 pos = pos + 1 | ||||
|             if pos == max: break | ||||
|  | ||||
|             if line[pos] in '#\r\n':           # skip comments or blank lines | ||||
|                 if line[pos] == '#': | ||||
|                     comment_token = line[pos:].rstrip('\r\n') | ||||
|                     nl_pos = pos + len(comment_token) | ||||
|                     yield (COMMENT, comment_token, | ||||
|                            (lnum, pos), (lnum, pos + len(comment_token)), line) | ||||
|                     yield (NL, line[nl_pos:], | ||||
|                            (lnum, nl_pos), (lnum, len(line)), line) | ||||
|                 else: | ||||
|                     yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], | ||||
|                            (lnum, pos), (lnum, len(line)), line) | ||||
|                 continue | ||||
|  | ||||
|             if column > indents[-1]:           # count indents or dedents | ||||
|                 indents.append(column) | ||||
|                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) | ||||
|             while column < indents[-1]: | ||||
|                 if column not in indents: | ||||
|                     raise IndentationError( | ||||
|                         "unindent does not match any outer indentation level", | ||||
|                         ("<tokenize>", lnum, pos, line)) | ||||
|                 indents = indents[:-1] | ||||
|                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line) | ||||
|  | ||||
|         else:                                  # continued statement | ||||
|             if not line: | ||||
|                 raise TokenError, ("EOF in multi-line statement", (lnum, 0)) | ||||
|             continued = 0 | ||||
|  | ||||
|         while pos < max: | ||||
|             pseudomatch = pseudoprog.match(line, pos) | ||||
|             if pseudomatch:                                # scan for tokens | ||||
|                 start, end = pseudomatch.span(1) | ||||
|                 spos, epos, pos = (lnum, start), (lnum, end), end | ||||
|                 token, initial = line[start:end], line[start] | ||||
|  | ||||
|                 if initial in numchars or \ | ||||
|                    (initial == '.' and token != '.'):      # ordinary number | ||||
|                     yield (NUMBER, token, spos, epos, line) | ||||
|                 elif initial in '\r\n': | ||||
|                     newline = NEWLINE | ||||
|                     if parenlev > 0: | ||||
|                         newline = NL | ||||
|                     yield (newline, token, spos, epos, line) | ||||
|                 elif initial == '#': | ||||
|                     assert not token.endswith("\n") | ||||
|                     yield (COMMENT, token, spos, epos, line) | ||||
|                 elif token in triple_quoted: | ||||
|                     endprog = endprogs[token] | ||||
|                     endmatch = endprog.match(line, pos) | ||||
|                     if endmatch:                           # all on one line | ||||
|                         pos = endmatch.end(0) | ||||
|                         token = line[start:pos] | ||||
|                         yield (STRING, token, spos, (lnum, pos), line) | ||||
|                     else: | ||||
|                         strstart = (lnum, start)           # multiple lines | ||||
|                         contstr = line[start:] | ||||
|                         contline = line | ||||
|                         break | ||||
|                 elif initial in single_quoted or \ | ||||
|                     token[:2] in single_quoted or \ | ||||
|                     token[:3] in single_quoted: | ||||
|                     if token[-1] == '\n':                  # continued string | ||||
|                         strstart = (lnum, start) | ||||
|                         endprog = (endprogs[initial] or endprogs[token[1]] or | ||||
|                                    endprogs[token[2]]) | ||||
|                         contstr, needcont = line[start:], 1 | ||||
|                         contline = line | ||||
|                         break | ||||
|                     else:                                  # ordinary string | ||||
|                         yield (STRING, token, spos, epos, line) | ||||
|                 elif initial in namechars:                 # ordinary name | ||||
|                     yield (NAME, token, spos, epos, line) | ||||
|                 elif initial == '\\':                      # continued stmt | ||||
|                     # This yield is new; needed for better idempotency: | ||||
|                     yield (NL, token, spos, (lnum, pos), line) | ||||
|                     continued = 1 | ||||
|                 else: | ||||
|                     if initial in '([{': parenlev = parenlev + 1 | ||||
|                     elif initial in ')]}': parenlev = parenlev - 1 | ||||
|                     yield (OP, token, spos, epos, line) | ||||
|             else: | ||||
|                 yield (ERRORTOKEN, line[pos], | ||||
|                            (lnum, pos), (lnum, pos+1), line) | ||||
|                 pos = pos + 1 | ||||
|  | ||||
|     for indent in indents[1:]:                 # pop remaining indent levels | ||||
|         yield (DEDENT, '', (lnum, 0), (lnum, 0), '') | ||||
|     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') | ||||
|  | ||||
| if __name__ == '__main__':                     # testing | ||||
|     import sys | ||||
|     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) | ||||
|     else: tokenize(sys.stdin.readline) | ||||
		Reference in New Issue
	
	Block a user