Package dns :: Module tokenizer
[hide private]
[frames | no frames]

Source Code for Module dns.tokenizer

  1  # Copyright (C) 2003-2007, 2009 Nominum, Inc. 
  2  # 
  3  # Permission to use, copy, modify, and distribute this software and its 
  4  # documentation for any purpose with or without fee is hereby granted, 
  5  # provided that the above copyright notice and this permission notice 
  6  # appear in all copies. 
  7  # 
  8  # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES 
  9  # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 10  # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR 
 11  # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 12  # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 13  # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
 14  # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 15   
 16  """Tokenize DNS master file format""" 
 17   
 18  import cStringIO 
 19  import sys 
 20   
 21  import dns.exception 
 22  import dns.name 
 23  import dns.ttl 
 24   
 25  _DELIMITERS = { 
 26      ' ' : True, 
 27      '\t' : True, 
 28      '\n' : True, 
 29      ';' : True, 
 30      '(' : True, 
 31      ')' : True, 
 32      '"' : True } 
 33   
 34  _QUOTING_DELIMITERS = { '"' : True } 
 35   
 36  EOF = 0 
 37  EOL = 1 
 38  WHITESPACE = 2 
 39  IDENTIFIER = 3 
 40  QUOTED_STRING = 4 
 41  COMMENT = 5 
 42  DELIMITER = 6 
 43   
class UngetBufferFull(dns.exception.DNSException):
    """Signals that an unget was attempted while the corresponding
    one-slot unget buffer (character or token) was already occupied."""
48
class Tokenizer(object):
    """A DNS master file format tokenizer.

    A token is a (type, value) tuple, where I{type} is an int, and
    I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
    IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: (int, string) token tuple
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.  When omitted it defaults to '<string>', '<stdin>' or
        '<file>' depending on what I{f} is.
        @type filename: string
        """

        if isinstance(f, str):
            # String input is wrapped so it can be read like a file.
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        A pending ungotten character is returned first.  Otherwise one
        character is read from the file; the empty string is returned at
        (and after) end of input.  The line number is advanced each time a
        newline is read from the file (not when one is re-delivered from
        the unget buffer).

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                # A newline only counts as whitespace inside parentheses.
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        An ungotten token, if any, is delivered first; an ungotten
        WHITESPACE or COMMENT token is silently dropped unless the matching
        I{want_*} flag is set.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: (int, string) tuple
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            if token[0] == WHITESPACE:
                if want_leading:
                    return token
            elif token[0] == COMMENT:
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return (WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    # Nothing accumulated yet: the delimiter itself decides
                    # what happens next.
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if not self.multiline > 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: switch to the quoting
                            # delimiter set and start a quoted string.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote left over from the previous
                            # call: leave quoting mode and move on.
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return (EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to (not including)
                        # the newline or end of input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return (COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError(
                                    'unbalanced parentheses')
                            return (EOF, '')
                        elif self.multiline:
                            # Inside parentheses the line end after a
                            # comment is just whitespace.
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return (EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter terminates the token being built; push
                    # it back so the next call sees it.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        # \DDD decimal escape: exactly three digits.
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # Check the character just read (c3), so that EOF
                        # here is reported as UnexpectedEnd.
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # A \DDD escape denotes a single octet; reject
                        # larger values as malformed input instead of
                        # letting chr() fail.
                        if codepoint > 255:
                            raise dns.exception.SyntaxError(
                                'escaped decimal value out of range')
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError(
                        'newline in quoted string')
            elif c == '\\':
                #
                # Treat \ followed by a delimiter as the
                # delimiter, otherwise leave it alone.
                #
                c = self._get_char()
                if c == '' or c not in self.delimiters:
                    self._unget_char(c)
                    c = '\\'
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return (ttype, token)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: (int, string) token tuple
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops (StopIteration) when an EOF token is read.

        @rtype: (int, string)
        """

        token = self.get()
        if token[0] == EOF:
            raise StopIteration
        return token

    def __iter__(self):
        """Return the tokenizer itself as the iterator."""
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        Only tokens made up entirely of digits are accepted, so the
        result is never negative.

        @raises dns.exception.SyntaxError: the token is not an identifier
        or is not all digits
        @rtype: int
        """

        (ttype, value) = self.get()
        if ttype != IDENTIFIER:
            raise dns.exception.SyntaxError('expecting an identifier')
        if not value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..4294967295
        @rtype: int
        """

        # int() promotes to a long automatically where needed, so the
        # shared get_int() helper is sufficient here.
        value = self.get_int()
        # The largest 32-bit unsigned value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        Both identifiers and quoted strings are accepted.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        (ttype, t) = self.get()
        if ttype != IDENTIFIER and ttype != QUOTED_STRING:
            raise dns.exception.SyntaxError('expecting a string')
        return t

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        @param origin: passed through to dns.name.from_text together with
        the token text
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        (ttype, t) = self.get()
        if ttype != IDENTIFIER:
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(t, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @rtype: string
        """

        (ttype, t) = self.get()
        if ttype != EOL and ttype != EOF:
            raise dns.exception.SyntaxError(
                'expected EOL or EOF, got %d "%s"' % (ttype, t))
        return t

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        The identifier text is handed to dns.ttl.from_text and its result
        is returned unchanged.

        @raises dns.exception.SyntaxError: the token is not an identifier
        """

        (ttype, t) = self.get()
        if ttype != IDENTIFIER:
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(t)
430