Package dns :: Module tokenizer
[hide private]
[frames] | [no frames]

Source Code for Module dns.tokenizer

  1  # Copyright (C) 2003-2007, 2009-2011 Nominum, Inc. 
  2  # 
  3  # Permission to use, copy, modify, and distribute this software and its 
  4  # documentation for any purpose with or without fee is hereby granted, 
  5  # provided that the above copyright notice and this permission notice 
  6  # appear in all copies. 
  7  # 
  8  # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES 
  9  # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 10  # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR 
 11  # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 12  # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 13  # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
 14  # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 15   
 16  """Tokenize DNS master file format""" 
 17   
 18  from io import StringIO 
 19  import sys 
 20   
 21  import dns.exception 
 22  import dns.name 
 23  import dns.ttl 
 24  from ._compat import long, text_type, binary_type 
 25   
 26  _DELIMITERS = { 
 27      ' ': True, 
 28      '\t': True, 
 29      '\n': True, 
 30      ';': True, 
 31      '(': True, 
 32      ')': True, 
 33      '"': True} 
 34   
 35  _QUOTING_DELIMITERS = {'"': True} 
 36   
 37  EOF = 0 
 38  EOL = 1 
 39  WHITESPACE = 2 
 40  IDENTIFIER = 3 
 41  QUOTED_STRING = 4 
 42  COMMENT = 5 
 43  DELIMITER = 6 
 44   
 45   
class UngetBufferFull(dns.exception.DNSException):
    """Raised when something is ungotten while the unget buffer is occupied.

    The tokenizer keeps at most one ungotten character and one ungotten
    token; pushing a second one back before the first has been consumed
    raises this exception.
    """
49 50
class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Return True if this is an EOF token."""
        return self.ttype == EOF

    def is_eol(self):
        """Return True if this is an EOL token."""
        return self.ttype == EOL

    def is_whitespace(self):
        """Return True if this is a WHITESPACE token."""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Return True if this is an IDENTIFIER token."""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Return True if this is a QUOTED_STRING token."""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Return True if this is a COMMENT token."""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Return True if this is a DELIMITER token."""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Return True if this is either an EOL or an EOF token."""
        return self.ttype == EOL or self.ttype == EOF

    def __eq__(self, other):
        """Tokens are equal when both type and value match.

        has_escape is deliberately not part of the comparison.  Anything
        that is not a Token compares unequal.
        """
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        """Exact negation of L{__eq__}."""
        return not self.__eq__(other)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token with the backslash escapes in the value resolved.

        Two escape forms are understood: a backslash followed by three
        decimal digits, which is replaced by the character with that
        decimal code, and a backslash followed by any other character,
        which is replaced by that character.

        If has_escape is false the token itself is returned unchanged;
        otherwise a new token of the same type is returned (its
        has_escape is False).

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape is not made of
        three digits, or encodes a value greater than 255
        """
        if not self.has_escape:
            return self
        value = self.value
        length = len(value)
        # Collect the pieces and join once rather than growing a string.
        pieces = []
        i = 0
        while i < length:
            c = value[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = value[i]
                i += 1
                if c.isdigit():
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c2 = value[i]
                    i += 1
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c3 = value[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # A decimal escape denotes a single octet.  Without
                    # this check chr() either raises a bare ValueError
                    # (Python 2) or silently returns a non-octet
                    # character (Python 3).
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
160 161
class Tokenizer(object):

    """A DNS master file format tokenizer.

    A token is a (type, value) tuple, where I{type} is an int, and
    I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
    IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: (int, string) token tuple
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.
        @type filename: string
        """

        if isinstance(f, text_type):
            f = StringIO(f)
            if filename is None:
                filename = '<string>'
        elif isinstance(f, binary_type):
            # Binary input is decoded with the default codec and then
            # handled exactly like text input.
            f = StringIO(f.decode())
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if any, is returned first.  At end of
        input the empty string is returned and the eof flag is set.
        The line number is advanced when a newline is read from the file
        (but not when an ungotten newline is re-read).

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is only handed back
            # if the caller asked for that kind of token; otherwise it is
            # dropped and tokenizing continues from the input.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: switch to the quoting
                            # delimiter set until the closing quote.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote left over from the previous
                            # call (it was ungotten when the quoted
                            # string token was returned).
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Comment: collect everything up to end of line
                        # or end of input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError(
                                    'unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # The third digit is the one that may be missing
                        # here (the original tested c again by mistake).
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # A decimal escape denotes a single octet.
                        if codepoint > 255:
                            raise dns.exception.SyntaxError
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: (int, string)
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    __next__ = next

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns.exception.SyntaxError: the token is not an identifier
        or does not consist of decimal digits
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        try:
            return int(token.value)
        except ValueError:
            # isdigit() accepts some characters (e.g. superscript digits)
            # that int() rejects; report those as a syntax error too.
            raise dns.exception.SyntaxError('expecting an integer')

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..4294967295
        @rtype: int
        """

        # long() keeps the return type the original had.
        value = long(self.get_int())
        # The largest unsigned 32-bit value is 2**32 - 1; the original
        # compared against 2**32 and so let one invalid value through.
        if value < 0 or value > long(4294967295):
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: unused; accepted for call compatibility
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        @param origin: unused; accepted for call compatibility
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token value is passed to L{dns.name.from_text} without being
        unescaped first.

        @param origin: passed through to L{dns.name.from_text}
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError(
                'expected EOL or EOF, got %d "%s"' % (token.ttype,
                                                      token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        @raises dns.exception.SyntaxError: the token is not an identifier
        @return: the result of L{dns.ttl.from_text} applied to the
        unescaped token value
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)
565