Package dns :: Module tokenizer
[hide private]
[frames] | [no frames]

Source Code for Module dns.tokenizer

  1  # Copyright (C) Dnspython Contributors, see LICENSE for text of ISC license 
  2   
  3  # Copyright (C) 2003-2017 Nominum, Inc. 
  4  # 
  5  # Permission to use, copy, modify, and distribute this software and its 
  6  # documentation for any purpose with or without fee is hereby granted, 
  7  # provided that the above copyright notice and this permission notice 
  8  # appear in all copies. 
  9  # 
 10  # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES 
 11  # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 12  # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR 
 13  # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 14  # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 15  # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
 16  # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 17   
 18  """Tokenize DNS master file format""" 
 19   
 20  from io import StringIO 
 21  import sys 
 22   
 23  import dns.exception 
 24  import dns.name 
 25  import dns.ttl 
 26  from ._compat import long, text_type, binary_type 
 27   
 28  _DELIMITERS = { 
 29      ' ': True, 
 30      '\t': True, 
 31      '\n': True, 
 32      ';': True, 
 33      '(': True, 
 34      ')': True, 
 35      '"': True} 
 36   
 37  _QUOTING_DELIMITERS = {'"': True} 
 38   
 39  EOF = 0 
 40  EOL = 1 
 41  WHITESPACE = 2 
 42  IDENTIFIER = 3 
 43  QUOTED_STRING = 4 
 44  COMMENT = 5 
 45  DELIMITER = 6 
 46   
 47   
class UngetBufferFull(dns.exception.DNSException):
    """Raised on an attempt to unget a character or token while the
    corresponding unget buffer is already occupied."""
50 51
class Token(object):
    """A DNS master file format token.

    ttype: The token type
    value: The token value
    has_escape: Does the token value contain escapes?
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance."""

        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Return True if this is an EOF token."""
        return self.ttype == EOF

    def is_eol(self):
        """Return True if this is an EOL token."""
        return self.ttype == EOL

    def is_whitespace(self):
        """Return True if this is a WHITESPACE token."""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Return True if this is an IDENTIFIER token."""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Return True if this is a QUOTED_STRING token."""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Return True if this is a COMMENT token."""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Return True if this is a DELIMITER token."""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Return True if this is either an EOL or an EOF token."""
        return self.ttype == EOL or self.ttype == EOF

    def __eq__(self, other):
        """Compare by type and value; has_escape is not considered."""
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        """Inverse of __eq__ (needed explicitly on Python 2)."""
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        A backslash followed by three decimal digits (\\DDD) becomes the
        character with that decimal code; a backslash followed by any
        other character becomes that character.  If has_escape is false
        the token itself is returned unchanged; otherwise a new Token of
        the same type is returned with has_escape left at its default.

        Raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape sequence.

        Raises dns.exception.SyntaxError: a decimal escape is not made of
        exactly three digits, or encodes a value greater than 255.
        """

        if not self.has_escape:
            return self
        text = self.value
        length = len(text)
        pieces = []
        i = 0
        while i < length:
            c = text[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = text[i]
                i += 1
                if c.isdigit():
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c2 = text[i]
                    i += 1
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c3 = text[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # \DDD denotes a single octet, so anything above 255
                    # is malformed input (and chr() would raise a bare
                    # ValueError for it on Python 2).
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
150 151
class Tokenizer(object):
    """A DNS master file format tokenizer.

    A token object is basically a (type, value) tuple.  The valid
    types are EOF, EOL, WHITESPACE, IDENTIFIER, QUOTED_STRING,
    COMMENT, and DELIMITER.

    file: The file to tokenize

    ungotten_char: The most recently ungotten character, or None.

    ungotten_token: The most recently ungotten token, or None.

    multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every
    time a ')' delimiter is read.

    quoting: This variable is true if the tokenizer is currently
    reading a quoted string.

    eof: This variable is true if the tokenizer has encountered EOF.

    delimiters: The current delimiter dictionary.

    line_number: The current line number

    filename: A filename that will be returned by the where() method.
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.

        filename: the name of the filename that the where() method
        will return.
        """

        if isinstance(f, text_type):
            f = StringIO(f)
            if filename is None:
                filename = '<string>'
        elif isinstance(f, binary_type):
            f = StringIO(f.decode())
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        Returns the ungotten character if there is one, otherwise the
        next character of the file.  The empty string is returned at
        (and after) end of input.
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        Returns a (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        c: the character to unget
        raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        Returns the number of characters skipped.
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.

        want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.

        Raises dns.exception.UnexpectedEnd: input ended prematurely

        Raises dns.exception.SyntaxError: input was badly formed

        Returns a Token.
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is only handed
            # back if the caller asked for that kind; otherwise it is
            # dropped and a fresh token is read below.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but not
                        # including, the end of the line or input.
                        while 1:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            # Push the terminator back so the next
                            # call reports the EOL/EOF itself.
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError(
                                    'unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # The third digit is what may be missing here;
                        # c is already known to be a digit.
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # \DDD denotes a single octet.
                        if codepoint > 255:
                            raise dns.exception.SyntaxError
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        token: the token to unget

        Raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Raises StopIteration when the EOF token is reached.

        Returns a Token.
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    __next__ = next

    def __iter__(self):
        return self

    # Helpers

    def get_int(self, base=10):
        """Read the next token and interpret it as an integer.

        base: the radix used to interpret the digits.  The default is 10.

        Raises dns.exception.SyntaxError if not an integer, including
        when a digit is not valid for the requested base.

        Returns an int.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        try:
            return int(token.value, base)
        except ValueError:
            # isdigit() does not guarantee the digits suit the base
            # (e.g. '9' in base 8); report that as a syntax error like
            # every other malformed integer.
            raise dns.exception.SyntaxError('expecting an integer')

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        Raises dns.exception.SyntaxError if not an 8-bit unsigned integer.

        Returns an int.
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self, base=10):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        base: the radix used to interpret the digits.  The default is 10.

        Raises dns.exception.SyntaxError if not a 16-bit unsigned integer.

        Returns an int.
        """

        value = self.get_int(base=base)
        if value < 0 or value > 65535:
            if base == 8:
                raise dns.exception.SyntaxError(
                    '%o is not an octal unsigned 16-bit integer' % value)
            else:
                raise dns.exception.SyntaxError(
                    '%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        Raises dns.exception.SyntaxError if not a 32-bit unsigned integer.

        Returns an int.
        """

        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError(
                '%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        origin: accepted but not used by this method.

        Raises dns.exception.SyntaxError if not a string.

        Returns a string.
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token, which should be an identifier.

        origin: accepted but not used by this method.

        Raises dns.exception.SyntaxError if not an identifier.

        Returns a string.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        origin: passed through to dns.name.from_text().

        Raises dns.exception.SyntaxError if not a name.

        Returns a dns.name.Name.
        """

        # The token is passed on without unescape(); the raw text is
        # handed to dns.name.from_text().
        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        Raises dns.exception.SyntaxError if the token is anything else.

        Returns a string.
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError(
                'expected EOL or EOF, got %d "%s"' % (token.ttype,
                                                      token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a DNS TTL.

        Raises dns.exception.SyntaxError or dns.ttl.BadTTL if not an
        identifier or badly formed.

        Returns an int.
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)