Package pyxb :: Package utils :: Module unicode
[hide private]
[frames] | no frames]

Source Code for Module pyxb.utils.unicode

  1  # Copyright 2009-2012, Peter A. Bigot 
  2  # 
  3  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  4  # not use this file except in compliance with the License. You may obtain a 
  5  # copy of the License at: 
  6  # 
  7  #            http://www.apache.org/licenses/LICENSE-2.0 
  8  # 
  9  # Unless required by applicable law or agreed to in writing, software 
 10  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 11  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 12  # License for the specific language governing permissions and limitations 
 13  # under the License. 
 14   
 15  """This module contains support for Unicode characters as required to 
 16  support the regular expression syntax defined in U{annex F 
 17  <http://www.w3.org/TR/xmlschema-2/#regexs>} 
 18  of the XML Schema definition. 
 19   
 20  In particular, we need to be able to identify character properties and 
 21  block escapes, as defined in F.1.1, by name. 
 22   
 23   - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt} 
 24   - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt} 
 25   - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt} 
 26   
 27  The Unicode database active at the time XML Schema 1.0 was defined is 
 28  archived at 
 29  U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html}, 
 30  and refers to U{Unicode Standard Annex #27: Unicode 3.1 
 31  <http://www.unicode.org/unicode/reports/tr27/>}. 
 32  """ 
 33   
 34  import re 
 35   
 36  SupportsWideUnicode = False 
 37  try: 
 38      re.compile(u'[\U0001d7ce-\U0001d7ff]') 
 39      SupportsWideUnicode = True 
 40  except: 
 41      pass 
 42   
 43  import bisect 
 44           
class CodePointSetError (LookupError):
    """Exception signalling that a L{CodePointSet} was used improperly."""
48
class CodePointSet (object):
    """Represent a set of Unicode code points.

    Each code point is an integral value between 0 and 0x10FFFF.  This
    class is used to represent a set of code points in a manner
    suitable for use as regular expression character sets."""

    MaxCodePoint = 0x10FFFF
    """The maximum value for a code point in the Unicode code point
    space.  This is normally 0xFFFF, because wide unicode characters
    are generally not enabled in Python builds.  If, however, they are
    enabled, this will be the full value of 0x10FFFF."""

    MaxShortCodePoint = 0xFFFF
    if not SupportsWideUnicode:
        MaxCodePoint = MaxShortCodePoint

    # The internal representation of the codepoints is as a sorted
    # list where values at an even index denote the first codepoint in
    # a range that is in the set, and the immediately following value
    # indicates the next following codepoint that is not in the set.
    # A missing value at the end is interpreted as MaxCodePoint.  For
    # example, the sequence [ 12, 15, 200 ] denotes the set containing
    # codepoints 12, 13, 14, and everything above 199.
    #
    # No stored value ever exceeds MaxCodePoint: a range that reaches
    # MaxCodePoint is always stored in the open-ended form, so equal
    # sets have equal lists.
    __codepoints = None

    def _codepoints (self):
        """For testing purposes only, access to the codepoints
        internal representation."""
        return self.__codepoints

    def __cmp__ (self, other):
        """Equality is delegated to the codepoints list."""
        return cmp(self.__codepoints, other.__codepoints)

    def __init__ (self, *args):
        """Create a code point set.

        @param args: Either a single L{CodePointSet} to copy, a single
        list of values, or any number of values.  Each value must be
        acceptable to L{add}."""
        self.__codepoints = []
        if 1 == len(args):
            if isinstance(args[0], CodePointSet):
                self.__codepoints.extend(args[0].__codepoints)
                return
            if isinstance(args[0], list):
                args = args[0]
        for _a in args:
            self.add(_a)

    def __mutate (self, value, do_add):
        """Add or remove a code point or range of code points.

        @param value: An integral code point, a one-character string,
        or a tuple C{(s,e)} of start and end (inclusive) code points.
        @param do_add: C{True} to add the value, C{False} to remove it.
        @return: C{self}
        @raise TypeError: C{value} is a string that is not exactly one
        character long.
        @raise ValueError: C{value} is a tuple whose end precedes its
        start."""
        # Identify the start (inclusive) and end (exclusive) code
        # points of the value's range.
        if isinstance(value, tuple):
            (s, e) = value
            e += 1
        elif isinstance(value, basestring):
            if 1 != len(value):
                raise TypeError('expected a single character')
            s = ord(value)
            e = s+1
        else:
            s = int(value)
            e = s+1
        # e is exclusive, so a valid range requires s < e.  Testing
        # only s > e would let a reversed tuple such as (5, 4) through
        # as an empty range and corrupt the boundary list.
        if s >= e:
            raise ValueError('codepoint range value order')

        # Validate the range for the code points supported by this
        # Python interpreter.  Because e is exclusive it is limited to
        # one past MaxCodePoint; limiting it to MaxCodePoint would
        # silently drop MaxCodePoint itself from the range.
        if s > self.MaxCodePoint:
            return self
        if e > self.MaxCodePoint:
            e = self.MaxCodePoint + 1

        # Index of first code point equal to or greater than s
        li = bisect.bisect_left(self.__codepoints, s)
        # Index of last code point less than or equal to e
        ri = bisect.bisect_right(self.__codepoints, e)
        # There are four cases; if we're subtracting, they reflect.
        # An odd index means the corresponding end of the incoming
        # range falls within (or abuts) a range already in the set.
        case = ((li & 1) << 1) | (ri & 1)
        if not do_add:
            case = 3 - case
        if 0x03 == case:
            # Add: Incoming value begins and ends within existing ranges
            del self.__codepoints[li:ri]
        elif 0x02 == case:
            # Add: Incoming value extends into an excluded range
            del self.__codepoints[li+1:ri]
            self.__codepoints[li] = e
        elif 0x01 == case:
            # Add: Incoming value begins in an excluded range
            del self.__codepoints[li+1:ri]
            self.__codepoints[li] = s
        else:
            # Add: Incoming value begins and ends within excluded ranges
            self.__codepoints[li:ri] = [s, e]

        # A boundary one past MaxCodePoint carries no information:
        # dropping it yields the equivalent open-ended form and keeps
        # the representation canonical.
        if self.__codepoints and (self.__codepoints[-1] > self.MaxCodePoint):
            self.__codepoints.pop()
        return self

    def add (self, value):
        """Add the given value to the code point set.

        @param value: An integral value denoting a code point, or a
        tuple C{(s,e)} denoting the start and end (inclusive) code
        points in a range.
        @return: C{self}"""
        return self.__mutate(value, True)

    def extend (self, values):
        """Add multiple values to a code point set.

        @param values: Either a L{CodePointSet} instance, or an iterable
        whose members are valid parameters to L{add}.

        @return: C{self}"""
        if isinstance(values, CodePointSet):
            values = values.asTuples()
        for _v in values:
            self.__mutate(_v, True)
        return self

    def subtract (self, value):
        """Remove the given value from the code point set.

        @param value: An integral value denoting a code point, or a tuple
        C{(s,e)} denoting the start and end (inclusive) code points in a
        range, or a L{CodePointSet}.

        @return: C{self}"""
        if isinstance(value, CodePointSet):
            # asTuples() builds a fresh list, so this is safe even when
            # value is self.
            for _v in value.asTuples():
                self.__mutate(_v, False)
            return self
        return self.__mutate(value, False)

    # Characters that must not appear unescaped in regular expression
    # patterns.  Within a character group a backslash would escape
    # whatever follows it and a leading caret would negate the group,
    # so both are escaped along with the range and bracket characters.
    __NotXMLChar_set = frozenset([ '-', '[', ']', '\\', '^' ])

    # Return the given code point as a unicode character suitable for
    # use in a regular expression
    def __unichr (self, code_point):
        rv = unichr(code_point)
        if rv in self.__NotXMLChar_set:
            rv = u'\\' + rv
        return rv

    def asPattern (self, with_brackets=True):
        """Return the code point set as Unicode regular expression
        character group consisting of a sequence of characters or
        character ranges.

        @param with_brackets: If C{True} (default), square brackets
        are added to enclose the returned character group."""
        rva = []
        if with_brackets:
            rva.append(u'[')
        for (s, e) in self.asTuples():
            if s == e:
                rva.append(self.__unichr(s))
            else:
                rva.extend([self.__unichr(s), '-', self.__unichr(e)])
        if with_brackets:
            rva.append(u']')
        return u''.join(rva)

    def asTuples (self):
        """Return the codepoints as tuples denoting the ranges that are in
        the set.

        Each tuple C{(s, e)} indicates that the code points from C{s}
        (inclusive) to C{e} (inclusive) are in the set."""

        rv = []
        start = None
        for boundary in self.__codepoints:
            if start is None:
                # Even position: first code point of an included range
                start = boundary
            else:
                # Odd position: first code point past the range
                rv.append( (start, boundary-1) )
                start = None
        if start is not None:
            # Unterminated final range runs through MaxCodePoint
            rv.append( (start, self.MaxCodePoint) )
        return rv

    def negate (self):
        """Return an instance that represents the inverse of this set."""
        rv = type(self)()
        if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]):
            # Set starts at zero: dropping that boundary inverts
            # every range
            rv.__codepoints.extend(self.__codepoints[1:])
        else:
            # Otherwise a new leading boundary at zero inverts
            # every range
            rv.__codepoints.append(0)
            rv.__codepoints.extend(self.__codepoints)
        return rv

    def asSingleCharacter (self):
        """If this set represents a single character, return it as its
        unicode string value.

        @raise CodePointSetError: the set does not consist of exactly
        one code point."""
        # Work from the ranges rather than the raw boundary list so
        # that a set holding only MaxCodePoint, which is stored as a
        # single open-ended boundary, is recognized too.
        ranges = self.asTuples()
        if (1 != len(ranges)) or (ranges[0][0] != ranges[0][1]):
            raise CodePointSetError('CodePointSet does not represent single character')
        return unichr(ranges[0][0])


# Generated Unicode tables; the escape tables at the end of this module
# use PropertyMap, which this import is expected to provide.
from unicode_data import *

class XML1p0e2 (object):
    """Regular expression support for XML Schema Data Types.

    This class holds character classes and regular expressions used to
    constrain the lexical space of XML Schema datatypes derived from
    U{string<http://www.w3.org/TR/xmlschema-2/#string>}.  They are
    from U{XML 1.0 (Second
    Edition)<http://www.w3.org/TR/2000/WD-xml-2e-20000814>} and
    U{Namespaces in XML
    <http://www.w3.org/TR/1999/REC-xml-names-19990114/>}.

    Unlike the regular expressions used for pattern constraints in XML
    Schema, which are derived from the Unicode 3.1 specification,
    these are derived from the Unicode 2.0 specification.

    The XML Schema definition refers explicitly to the second edition
    of XML, so we have to use these code point sets and patterns.  Be
    aware that U{subsequent updates to the XML specification
    <http://www.w3.org/XML/xml-V10-4e-errata#E09>} have changed the
    corresponding patterns for other uses of XML.  One significant
    change is that the original specification, used here, does not
    allow wide unicode characters."""

    Char = CodePointSet(
        0x0009,
        0x000A,
        0x000D,
        ( 0x0020, 0xD7FF ),
        ( 0xE000, 0xFFFD )
        )
    if SupportsWideUnicode:
        # add(), not extend(): extend() would iterate the tuple and
        # add its two end points as isolated code points instead of
        # the range between them.
        Char.add( ( 0x10000, 0x10FFFF ) )

    BaseChar = CodePointSet(
        ( 0x0041, 0x005A ),
        ( 0x0061, 0x007A ),
        ( 0x00C0, 0x00D6 ),
        ( 0x00D8, 0x00F6 ),
        ( 0x00F8, 0x00FF ),
        ( 0x0100, 0x0131 ),
        ( 0x0134, 0x013E ),
        ( 0x0141, 0x0148 ),
        ( 0x014A, 0x017E ),
        ( 0x0180, 0x01C3 ),
        ( 0x01CD, 0x01F0 ),
        ( 0x01F4, 0x01F5 ),
        ( 0x01FA, 0x0217 ),
        ( 0x0250, 0x02A8 ),
        ( 0x02BB, 0x02C1 ),
        0x0386,
        ( 0x0388, 0x038A ),
        0x038C,
        ( 0x038E, 0x03A1 ),
        ( 0x03A3, 0x03CE ),
        ( 0x03D0, 0x03D6 ),
        0x03DA,
        0x03DC,
        0x03DE,
        0x03E0,
        ( 0x03E2, 0x03F3 ),
        ( 0x0401, 0x040C ),
        ( 0x040E, 0x044F ),
        ( 0x0451, 0x045C ),
        ( 0x045E, 0x0481 ),
        ( 0x0490, 0x04C4 ),
        ( 0x04C7, 0x04C8 ),
        ( 0x04CB, 0x04CC ),
        ( 0x04D0, 0x04EB ),
        ( 0x04EE, 0x04F5 ),
        ( 0x04F8, 0x04F9 ),
        ( 0x0531, 0x0556 ),
        0x0559,
        ( 0x0561, 0x0586 ),
        ( 0x05D0, 0x05EA ),
        ( 0x05F0, 0x05F2 ),
        ( 0x0621, 0x063A ),
        ( 0x0641, 0x064A ),
        ( 0x0671, 0x06B7 ),
        ( 0x06BA, 0x06BE ),
        ( 0x06C0, 0x06CE ),
        ( 0x06D0, 0x06D3 ),
        0x06D5,
        ( 0x06E5, 0x06E6 ),
        ( 0x0905, 0x0939 ),
        0x093D,
        ( 0x0958, 0x0961 ),
        ( 0x0985, 0x098C ),
        ( 0x098F, 0x0990 ),
        ( 0x0993, 0x09A8 ),
        ( 0x09AA, 0x09B0 ),
        0x09B2,
        ( 0x09B6, 0x09B9 ),
        ( 0x09DC, 0x09DD ),
        ( 0x09DF, 0x09E1 ),
        ( 0x09F0, 0x09F1 ),
        ( 0x0A05, 0x0A0A ),
        ( 0x0A0F, 0x0A10 ),
        ( 0x0A13, 0x0A28 ),
        ( 0x0A2A, 0x0A30 ),
        ( 0x0A32, 0x0A33 ),
        ( 0x0A35, 0x0A36 ),
        ( 0x0A38, 0x0A39 ),
        ( 0x0A59, 0x0A5C ),
        0x0A5E,
        ( 0x0A72, 0x0A74 ),
        ( 0x0A85, 0x0A8B ),
        0x0A8D,
        ( 0x0A8F, 0x0A91 ),
        ( 0x0A93, 0x0AA8 ),
        ( 0x0AAA, 0x0AB0 ),
        ( 0x0AB2, 0x0AB3 ),
        ( 0x0AB5, 0x0AB9 ),
        0x0ABD,
        0x0AE0,
        ( 0x0B05, 0x0B0C ),
        ( 0x0B0F, 0x0B10 ),
        ( 0x0B13, 0x0B28 ),
        ( 0x0B2A, 0x0B30 ),
        ( 0x0B32, 0x0B33 ),
        ( 0x0B36, 0x0B39 ),
        0x0B3D,
        ( 0x0B5C, 0x0B5D ),
        ( 0x0B5F, 0x0B61 ),
        ( 0x0B85, 0x0B8A ),
        ( 0x0B8E, 0x0B90 ),
        ( 0x0B92, 0x0B95 ),
        ( 0x0B99, 0x0B9A ),
        0x0B9C,
        ( 0x0B9E, 0x0B9F ),
        ( 0x0BA3, 0x0BA4 ),
        ( 0x0BA8, 0x0BAA ),
        ( 0x0BAE, 0x0BB5 ),
        ( 0x0BB7, 0x0BB9 ),
        ( 0x0C05, 0x0C0C ),
        ( 0x0C0E, 0x0C10 ),
        ( 0x0C12, 0x0C28 ),
        ( 0x0C2A, 0x0C33 ),
        ( 0x0C35, 0x0C39 ),
        ( 0x0C60, 0x0C61 ),
        ( 0x0C85, 0x0C8C ),
        ( 0x0C8E, 0x0C90 ),
        ( 0x0C92, 0x0CA8 ),
        ( 0x0CAA, 0x0CB3 ),
        ( 0x0CB5, 0x0CB9 ),
        0x0CDE,
        ( 0x0CE0, 0x0CE1 ),
        ( 0x0D05, 0x0D0C ),
        ( 0x0D0E, 0x0D10 ),
        ( 0x0D12, 0x0D28 ),
        ( 0x0D2A, 0x0D39 ),
        ( 0x0D60, 0x0D61 ),
        ( 0x0E01, 0x0E2E ),
        0x0E30,
        ( 0x0E32, 0x0E33 ),
        ( 0x0E40, 0x0E45 ),
        ( 0x0E81, 0x0E82 ),
        0x0E84,
        ( 0x0E87, 0x0E88 ),
        0x0E8A,
        0x0E8D,
        ( 0x0E94, 0x0E97 ),
        ( 0x0E99, 0x0E9F ),
        ( 0x0EA1, 0x0EA3 ),
        0x0EA5,
        0x0EA7,
        ( 0x0EAA, 0x0EAB ),
        ( 0x0EAD, 0x0EAE ),
        0x0EB0,
        ( 0x0EB2, 0x0EB3 ),
        0x0EBD,
        ( 0x0EC0, 0x0EC4 ),
        ( 0x0F40, 0x0F47 ),
        ( 0x0F49, 0x0F69 ),
        ( 0x10A0, 0x10C5 ),
        ( 0x10D0, 0x10F6 ),
        0x1100,
        ( 0x1102, 0x1103 ),
        ( 0x1105, 0x1107 ),
        0x1109,
        ( 0x110B, 0x110C ),
        ( 0x110E, 0x1112 ),
        0x113C,
        0x113E,
        0x1140,
        0x114C,
        0x114E,
        0x1150,
        ( 0x1154, 0x1155 ),
        0x1159,
        ( 0x115F, 0x1161 ),
        0x1163,
        0x1165,
        0x1167,
        0x1169,
        ( 0x116D, 0x116E ),
        ( 0x1172, 0x1173 ),
        0x1175,
        0x119E,
        0x11A8,
        0x11AB,
        ( 0x11AE, 0x11AF ),
        ( 0x11B7, 0x11B8 ),
        0x11BA,
        ( 0x11BC, 0x11C2 ),
        0x11EB,
        0x11F0,
        0x11F9,
        ( 0x1E00, 0x1E9B ),
        ( 0x1EA0, 0x1EF9 ),
        ( 0x1F00, 0x1F15 ),
        ( 0x1F18, 0x1F1D ),
        ( 0x1F20, 0x1F45 ),
        ( 0x1F48, 0x1F4D ),
        ( 0x1F50, 0x1F57 ),
        0x1F59,
        0x1F5B,
        0x1F5D,
        ( 0x1F5F, 0x1F7D ),
        ( 0x1F80, 0x1FB4 ),
        ( 0x1FB6, 0x1FBC ),
        0x1FBE,
        ( 0x1FC2, 0x1FC4 ),
        ( 0x1FC6, 0x1FCC ),
        ( 0x1FD0, 0x1FD3 ),
        ( 0x1FD6, 0x1FDB ),
        ( 0x1FE0, 0x1FEC ),
        ( 0x1FF2, 0x1FF4 ),
        ( 0x1FF6, 0x1FFC ),
        0x2126,
        ( 0x212A, 0x212B ),
        0x212E,
        ( 0x2180, 0x2182 ),
        ( 0x3041, 0x3094 ),
        ( 0x30A1, 0x30FA ),
        ( 0x3105, 0x312C ),
        ( 0xAC00, 0xD7A3 )
        )

    Ideographic = CodePointSet(
        ( 0x4E00, 0x9FA5 ),
        0x3007,
        ( 0x3021, 0x3029 )
        )

    Letter = CodePointSet(BaseChar).extend(Ideographic)

    CombiningChar = CodePointSet(
        ( 0x0300, 0x0345 ),
        ( 0x0360, 0x0361 ),
        ( 0x0483, 0x0486 ),
        ( 0x0591, 0x05A1 ),
        ( 0x05A3, 0x05B9 ),
        ( 0x05BB, 0x05BD ),
        0x05BF,
        ( 0x05C1, 0x05C2 ),
        0x05C4,
        ( 0x064B, 0x0652 ),
        0x0670,
        ( 0x06D6, 0x06DC ),
        ( 0x06DD, 0x06DF ),
        ( 0x06E0, 0x06E4 ),
        ( 0x06E7, 0x06E8 ),
        ( 0x06EA, 0x06ED ),
        ( 0x0901, 0x0903 ),
        0x093C,
        ( 0x093E, 0x094C ),
        0x094D,
        ( 0x0951, 0x0954 ),
        ( 0x0962, 0x0963 ),
        ( 0x0981, 0x0983 ),
        0x09BC,
        0x09BE,
        0x09BF,
        ( 0x09C0, 0x09C4 ),
        ( 0x09C7, 0x09C8 ),
        ( 0x09CB, 0x09CD ),
        0x09D7,
        ( 0x09E2, 0x09E3 ),
        0x0A02,
        0x0A3C,
        0x0A3E,
        0x0A3F,
        ( 0x0A40, 0x0A42 ),
        ( 0x0A47, 0x0A48 ),
        ( 0x0A4B, 0x0A4D ),
        ( 0x0A70, 0x0A71 ),
        ( 0x0A81, 0x0A83 ),
        0x0ABC,
        ( 0x0ABE, 0x0AC5 ),
        ( 0x0AC7, 0x0AC9 ),
        ( 0x0ACB, 0x0ACD ),
        ( 0x0B01, 0x0B03 ),
        0x0B3C,
        ( 0x0B3E, 0x0B43 ),
        ( 0x0B47, 0x0B48 ),
        ( 0x0B4B, 0x0B4D ),
        ( 0x0B56, 0x0B57 ),
        ( 0x0B82, 0x0B83 ),
        ( 0x0BBE, 0x0BC2 ),
        ( 0x0BC6, 0x0BC8 ),
        ( 0x0BCA, 0x0BCD ),
        0x0BD7,
        ( 0x0C01, 0x0C03 ),
        ( 0x0C3E, 0x0C44 ),
        ( 0x0C46, 0x0C48 ),
        ( 0x0C4A, 0x0C4D ),
        ( 0x0C55, 0x0C56 ),
        ( 0x0C82, 0x0C83 ),
        ( 0x0CBE, 0x0CC4 ),
        ( 0x0CC6, 0x0CC8 ),
        ( 0x0CCA, 0x0CCD ),
        ( 0x0CD5, 0x0CD6 ),
        ( 0x0D02, 0x0D03 ),
        ( 0x0D3E, 0x0D43 ),
        ( 0x0D46, 0x0D48 ),
        ( 0x0D4A, 0x0D4D ),
        0x0D57,
        0x0E31,
        ( 0x0E34, 0x0E3A ),
        ( 0x0E47, 0x0E4E ),
        0x0EB1,
        ( 0x0EB4, 0x0EB9 ),
        ( 0x0EBB, 0x0EBC ),
        ( 0x0EC8, 0x0ECD ),
        ( 0x0F18, 0x0F19 ),
        0x0F35,
        0x0F37,
        0x0F39,
        0x0F3E,
        0x0F3F,
        ( 0x0F71, 0x0F84 ),
        ( 0x0F86, 0x0F8B ),
        ( 0x0F90, 0x0F95 ),
        0x0F97,
        ( 0x0F99, 0x0FAD ),
        ( 0x0FB1, 0x0FB7 ),
        0x0FB9,
        ( 0x20D0, 0x20DC ),
        0x20E1,
        ( 0x302A, 0x302F ),
        0x3099,
        0x309A
        )

    Digit = CodePointSet(
        ( 0x0030, 0x0039 ),
        ( 0x0660, 0x0669 ),
        ( 0x06F0, 0x06F9 ),
        ( 0x0966, 0x096F ),
        ( 0x09E6, 0x09EF ),
        ( 0x0A66, 0x0A6F ),
        ( 0x0AE6, 0x0AEF ),
        ( 0x0B66, 0x0B6F ),
        ( 0x0BE7, 0x0BEF ),
        ( 0x0C66, 0x0C6F ),
        ( 0x0CE6, 0x0CEF ),
        ( 0x0D66, 0x0D6F ),
        ( 0x0E50, 0x0E59 ),
        ( 0x0ED0, 0x0ED9 ),
        ( 0x0F20, 0x0F29 )
        )

    Extender = CodePointSet(
        0x00B7,
        0x02D0,
        0x02D1,
        0x0387,
        0x0640,
        0x0E46,
        0x0EC6,
        0x3005,
        ( 0x3031, 0x3035 ),
        ( 0x309D, 0x309E ),
        ( 0x30FC, 0x30FE )
        )

    # Not an explicit production, but used in Name production
    NameStartChar = CodePointSet(Letter)
    NameStartChar.add(ord('_'))
    NameStartChar.add(ord(':'))

    # As NameStartChar, but without the colon
    NCNameStartChar = CodePointSet(Letter)
    NCNameStartChar.add(ord('_'))

    NameChar = CodePointSet(Letter)
    NameChar.extend(Digit)
    NameChar.add(ord('.'))
    NameChar.add(ord('-'))
    NameChar.add(ord('_'))
    NameChar.add(ord(':'))
    NameChar.extend(CombiningChar)
    NameChar.extend(Extender)

    # As NameChar, but without the colon
    NCNameChar = CodePointSet(Letter)
    NCNameChar.extend(Digit)
    NCNameChar.add(ord('.'))
    NCNameChar.add(ord('-'))
    NCNameChar.add(ord('_'))
    NCNameChar.extend(CombiningChar)
    NCNameChar.extend(Extender)

    # Each *_pat is an unanchored pattern fragment; the matching *_re
    # is the compiled form anchored at both ends.
    Name_pat = '%s%s*' % (NameStartChar.asPattern(), NameChar.asPattern())
    Name_re = re.compile('^%s$' % (Name_pat,))
    NmToken_pat = '%s+' % (NameChar.asPattern(),)
    NmToken_re = re.compile('^%s$' % (NmToken_pat,))
    NCName_pat = '%s%s*' % (NCNameStartChar.asPattern(), NCNameChar.asPattern())
    NCName_re = re.compile('^%s$' % (NCName_pat,))
    QName_pat = '(%s:)?%s' % (NCName_pat, NCName_pat)
    QName_re = re.compile('^%s$' % (QName_pat,))

# Production 24 : Single Character Escapes
# Maps the character following the backslash to the set holding the one
# code point that the escape denotes.
SingleCharEsc = { }
SingleCharEsc['n'] = CodePointSet(0x0A)
SingleCharEsc['r'] = CodePointSet(0x0D)
SingleCharEsc['t'] = CodePointSet(0x09)
# The remaining escapes each denote the escaped character itself.
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))

# Production 37 : Multi-Character Escapes
# The wildcard is everything other than newline and carriage return.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()

# The directly defined classes are entered first; each complementary
# class is then derived from its counterpart by negation.
MultiCharEsc = {
    's' : CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r')),
    'i' : CodePointSet(XML1p0e2.Letter).add(ord('_')).add(ord(':')),
    'c' : CodePointSet(XML1p0e2.NameChar),
    'd' : PropertyMap['Nd'],
    'W' : CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C']),
    }
MultiCharEsc['S'] = MultiCharEsc['s'].negate()
MultiCharEsc['I'] = MultiCharEsc['i'].negate()
MultiCharEsc['C'] = MultiCharEsc['c'].negate()
MultiCharEsc['D'] = MultiCharEsc['d'].negate()
MultiCharEsc['w'] = MultiCharEsc['W'].negate()