Package pyxb :: Package utils :: Module unicode
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.unicode

  1  # Copyright 2009, Peter A. Bigot 
  2  # 
  3  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  4  # not use this file except in compliance with the License. You may obtain a 
  5  # copy of the License at: 
  6  # 
  7  #            http://www.apache.org/licenses/LICENSE-2.0 
  8  # 
  9  # Unless required by applicable law or agreed to in writing, software 
 10  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 11  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 12  # License for the specific language governing permissions and limitations 
 13  # under the License. 
 14   
 15  """This module contains support for Unicode characters as required to 
 16  support the regular expression syntax defined in U{annex F 
 17  <http://www.w3.org/TR/xmlschema-2/#regexs>} 
 18  of the XML Schema definition. 
 19   
 20  In particular, we need to be able to identify character properties and 
 21  block escapes, as defined in F.1.1, by name. 
 22   
 23   - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt} 
 24   - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt} 
 25   - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt} 
 26   
 27  The Unicode database active at the time XML Schema 1.0 was defined is 
 28  archived at 
 29  U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html}, 
 30  and refers to U{Unicode Standard Annex #27: Unicode 3.1 
 31  <http://www.unicode.org/unicode/reports/tr27/>}. 
 32  """ 
 33   
 34  import re 
 35   
 36  SupportsWideUnicode = False 
 37  try: 
 38      re.compile('[\U1d7ce-\U1d7ff]') 
 39      SupportsWideUnicode = True 
 40  except: 
 41      pass 
 42   
 43  import bisect 
 44           
class CodePointSetError (LookupError):
    """Signals misuse of a L{CodePointSet}, for example asking a set that
    holds more than one code point for its single character."""
48
class CodePointSet (object):
    """Represent a set of Unicode code points.

    Each code point is an integral value between 0 and 0x10FFFF.  The
    set is kept as a sorted list of range boundaries so that it can be
    rendered directly as a regular expression character set (see
    L{asPattern})."""

    MaxCodePoint = 0x10FFFF
    """The largest code point an instance will store.  It is the full
    Unicode limit of 0x10FFFF when C{SupportsWideUnicode} is true, and
    is lowered to L{MaxShortCodePoint} otherwise."""

    # Largest code point that fits in a single 16-bit unit.
    MaxShortCodePoint = 0xFFFF
    if not SupportsWideUnicode:
        MaxCodePoint = MaxShortCodePoint

    # The internal representation of the codepoints is a sorted list
    # of boundaries.  A value at an even index is the first codepoint
    # of a range that is in the set; the value that follows it is the
    # first codepoint after that range, i.e. the exclusive end.  A
    # missing final value means the last range runs through
    # MaxCodePoint.  For example, [ 12, 15, 200 ] denotes the set
    # containing codepoints 12, 13, 14, and everything from 200 up.
    # Boundaries greater than MaxCodePoint are never stored, so each
    # set has exactly one representation.
    __codepoints = None

    def _codepoints (self):
        """For testing purposes only, access to the codepoints
        internal representation."""
        return self.__codepoints

    def __cmp__ (self, other):
        """Comparison is delegated to the codepoints lists.

        This is the Python 2 comparison hook; C{other} must be a
        L{CodePointSet}."""
        return cmp(self.__codepoints, other.__codepoints)

    def __init__ (self, *args):
        """Create a set from zero or more initial members.

        @param args: If the sole argument is a L{CodePointSet}, its
        content is copied.  If the sole argument is a list, its
        elements are used as the members.  Otherwise each argument is
        passed to L{add}."""
        self.__codepoints = []
        if 1 == len(args):
            if isinstance(args[0], CodePointSet):
                self.__codepoints.extend(args[0].__codepoints)
                return
            if isinstance(args[0], list):
                args = args[0]
        for member in args:
            self.add(member)

    def __mutate (self, value, do_add):
        """Add or remove one code point or range of code points.

        @param value: An integral code point, a one-character string,
        or a tuple C{(s,e)} of inclusive start and end code points.
        @param do_add: C{True} to add the range, C{False} to remove it.
        @return: C{self}
        @raise TypeError: C{value} is a string of more than one character.
        @raise ValueError: the end of a tuple range precedes its start."""
        # Identify the start (inclusive) and end (exclusive) code
        # points of the value's range.
        if isinstance(value, tuple):
            (s, e) = value
            e += 1
        elif isinstance(value, basestring):
            if 1 < len(value):
                raise TypeError('expected a single character')
            s = ord(value)
            e = s + 1
        else:
            s = int(value)
            e = s + 1
        # e is exclusive here, so a valid range always has s < e.
        # Testing s > e would let a tuple such as (5, 4) through as an
        # empty range and corrupt the boundary list.
        if s >= e:
            raise ValueError('codepoint range value order')

        # Restrict the range to the code points supported by this
        # Python interpreter.  Since e is exclusive, the largest
        # useful value is one past MaxCodePoint; clamping to
        # MaxCodePoint itself would make that code point unreachable.
        if s > self.MaxCodePoint:
            return self
        e = min(e, self.MaxCodePoint + 1)

        cp = self.__codepoints
        # Index of the first boundary equal to or greater than s.  An
        # odd index means s lies within (or abuts the end of) a range
        # that is in the set.
        li = bisect.bisect_left(cp, s)
        # Index of the first boundary greater than e.  An odd index
        # means e lies within (or abuts the start of) a range that is
        # in the set.
        ri = bisect.bisect_right(cp, e)
        # There are four cases; if we're subtracting, they reflect.
        case = ((li & 1) << 1) | (ri & 1)
        if not do_add:
            case = 3 - case
        if 0x03 == case:
            # Add: Incoming value begins and ends within existing ranges
            del cp[li:ri]
        elif 0x02 == case:
            # Add: Incoming value extends into an excluded range
            del cp[li+1:ri]
            cp[li] = e
        elif 0x01 == case:
            # Add: Incoming value begins in an excluded range
            del cp[li+1:ri]
            cp[li] = s
        else:
            # Add: Incoming value begins and ends within excluded ranges
            cp[li:ri] = [s, e]

        # A final boundary past MaxCodePoint carries no information:
        # as an exclusive end it is the same as the implicit open end,
        # and as a start it begins an empty range.  Dropping it keeps
        # the representation canonical.
        while cp and (cp[-1] > self.MaxCodePoint):
            cp.pop()
        return self

    def add (self, value):
        """Add the given value to the code point set.

        @param value: An integral value denoting a code point, or a
        tuple C{(s,e)} denoting the start and end (inclusive) code
        points in a range.
        @return: C{self}"""
        return self.__mutate(value, True)

    def extend (self, values):
        """Add multiple values to a code point set.

        @param values: Either a L{CodePointSet} instance, or an iterable
        whose members are valid parameters to L{add}.

        @return: C{self}"""
        if isinstance(values, CodePointSet):
            values = values.asTuples()
        for member in values:
            self.__mutate(member, True)
        return self

    def subtract (self, value):
        """Remove the given value from the code point set.

        @param value: An integral value denoting a code point, or a tuple
        C{(s,e)} denoting the start and end (inclusive) code points in a
        range, or a L{CodePointSet}.

        @return: C{self}"""
        if isinstance(value, CodePointSet):
            for member in value.asTuples():
                self.__mutate(member, False)
            return self
        return self.__mutate(value, False)

    # Characters that must not appear unescaped inside a regular
    # expression character set: '-' forms ranges, '[' and ']' delimit
    # the set, a backslash would escape whatever follows it, and '^'
    # in first position negates the set.
    __NotXMLChar_set = frozenset([ '-', '[', ']', '\\', '^' ])

    def __unichr (self, code_point):
        """Return the given code point as a unicode character suitable
        for use in a regular expression character set, preceded by a
        backslash if it is special there."""
        rv = unichr(code_point)
        if rv in self.__NotXMLChar_set:
            rv = u'\\' + rv
        return rv

    def asPattern (self, with_brackets=True):
        """Return the code point set as Unicode regular expression
        character group consisting of a sequence of characters or
        character ranges.

        @param with_brackets: If C{True} (default), square brackets
        are added to enclose the returned character group.
        @return: The character group as a unicode string."""
        rva = []
        if with_brackets:
            rva.append(u'[')
        for (s, e) in self.asTuples():
            rva.append(self.__unichr(s))
            if s != e:
                rva.append(u'-')
                rva.append(self.__unichr(e))
        if with_brackets:
            rva.append(u']')
        return u''.join(rva)

    def asTuples (self):
        """Return the codepoints as tuples denoting the ranges that are in
        the set.

        Each tuple C{(s, e)} indicates that the code points from C{s}
        (inclusive) to C{e} (inclusive) are in the set."""
        cp = self.__codepoints
        # Pair each range start with the boundary that closes it; stored
        # ends are exclusive, so subtract one to report them inclusively.
        rv = [ (cp[i], cp[i+1] - 1) for i in range(0, len(cp) - 1, 2) ]
        if len(cp) & 1:
            # An unpaired final start runs through the maximum code point.
            rv.append( (cp[-1], self.MaxCodePoint) )
        return rv

    def negate (self):
        """Return an instance that represents the inverse of this set."""
        rv = type(self)()
        if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]):
            # The set starts at code point zero: dropping that boundary
            # swaps the role of every boundary that follows.
            rv.__codepoints.extend(self.__codepoints[1:])
        else:
            # Otherwise a new leading boundary at zero does the same.
            rv.__codepoints.append(0)
            rv.__codepoints.extend(self.__codepoints)
        return rv

    def asSingleCharacter (self):
        """If this set represents a single character, return it as its
        unicode string value.

        @raise CodePointSetError: the set does not hold exactly one
        code point."""
        ranges = self.asTuples()
        if (1 != len(ranges)) or (ranges[0][0] != ranges[0][1]):
            raise CodePointSetError('CodePointSet does not represent single character')
        return unichr(ranges[0][0])
from unicode_data import *

# Several multi-character escapes are defined in terms of the U{NameChar
# <http://www.w3.org/TR/REC-xml/#NT-NameChar>} production of base XML.
# XMLSchema 1.0 cites the 2nd edition of XML, whose Annex B spells out
# the relevant character classes using Unicode 2.0.  The current (fifth,
# at this writing) edition characterizes them far more simply, and that
# simpler definition is the one used here.
_NameStartChar = CodePointSet([
    ord(':'),
    ( ord('A'), ord('Z') ),
    ord('_'),
    ( ord('a'), ord('z') ),
    ( 0xC0, 0xD6 ),
    ( 0xD8, 0xF6 ),
    ( 0xF8, 0x2FF ),
    ( 0x370, 0x37D ),
    ( 0x37F, 0x1FFF ),
    ( 0x200C, 0x200D ),
    ( 0x2070, 0x218F ),
    ( 0x2C00, 0x2FEF ),
    ( 0x3001, 0xD7FF ),
    ( 0xF900, 0xFDCF ),
    ( 0xFDF0, 0xFFFD ),
    ( 0x10000, 0xEFFFF ),
])

# Name characters: everything that may start a name, plus the characters
# that are allowed in a name anywhere except at its start.
_NameChar = CodePointSet(_NameStartChar)
_NameChar.extend([
    ord('-'),
    ord('.'),
    ( ord('0'), ord('9') ),
    0xB7,
    ( 0x0300, 0x036F ),
    ( 0x203F, 0x2040 ),
])

# Production 24 : Single Character Escapes
SingleCharEsc = dict(n=CodePointSet(0x0A),
                     r=CodePointSet(0x0D),
                     t=CodePointSet(0x09))
# Each of the remaining escapes stands for the escaped character itself.
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))

# Production 37 : Multi-Character Escapes
# The wildcard matches everything except newline and carriage return.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()

# Start with the classes that are defined directly; 'W' is the union of
# the P, Z and C property sets.
MultiCharEsc = {
    's' : CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r')),
    'i' : _NameStartChar,
    'c' : _NameChar,
    'd' : PropertyMap['Nd'],
    'W' : CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C']),
}
# Each remaining class is the complement of its opposite-case partner.
MultiCharEsc.update({
    'S' : MultiCharEsc['s'].negate(),
    'I' : MultiCharEsc['i'].negate(),
    'C' : MultiCharEsc['c'].negate(),
    'D' : MultiCharEsc['d'].negate(),
    'w' : MultiCharEsc['W'].negate(),
})