pyxb.utils.unicode

50 """Represent a set of Unicode code points. 51 52 Each code point is an integral value between 0 and 0x10FFFF. This 53 class is used to represent a set of code points in a manner 54 suitable for use as regular expression character sets.""" 55 56 MaxCodePoint = 0x10FFFF 57 """The maximum value for a code point in the Unicode code point 58 space. This is normally 0xFFFF, because wide unicode characters 59 are generally not enabled in Python builds. If, however, they are 60 enabled, this will be the full value of 0x10FFFF.""" 61 62 MaxShortCodePoint = 0xFFFF 63 if not SupportsWideUnicode: 64 MaxCodePoint = MaxShortCodePoint 65 66 # The internal representation of the codepoints is as a sorted 67 # list where values at an even index denote the first codepoint in 68 # a range that is in the set, and the immediately following value 69 # indicates the next following codepoint that is not in the set. 70 # A missing value at the end is interpreted as MaxCodePoint. For 71 # example, the sequence [ 12, 15, 200 ] denotes the set containing 72 # codepoints 12, 13, 14, and everything above 199. 73 __codepoints = None 74

75 - def _codepoints (self):

76 """For testing purrposes only, access to the codepoints 77 internal representation.""" 78 return self.__codepoints

79

def __cmp__ (self, other):
    """Compare two sets by comparing their internal codepoint lists.

    @param other: The object to compare against; its internal
    codepoint list is read directly.
    @return: Whatever C{cmp} returns for the two lists (zero when
    they are equal)."""
    mine = self.__codepoints
    theirs = other.__codepoints
    return cmp(mine, theirs)

83

84 - def __init__ (self, *args):

85 self.__codepoints = [] 86 if 1 == len(args): 87 if isinstance(args[0], CodePointSet): 88 self.__codepoints.extend(args[0].__codepoints) 89 return 90 if isinstance(args[0], list): 91 args = args[0] 92 [ self.add(_a) for _a in args ]

93 94

95 - def __mutate (self, value, do_add):

96 # Identify the start (inclusive) and end (exclusive) code 97 # points of the value's range. 98 if isinstance(value, tuple): 99 (s, e) = value 100 e += 1 101 elif isinstance(value, basestring): 102 if 1 < len(value): 103 raise TypeError() 104 s = ord(value) 105 e = s+1 106 else: 107 s = int(value) 108 e = s+1 109 if s > e: 110 raise ValueError('codepoint range value order') 111 112 # Validate the range for the code points supported by this 113 # Python interpreter 114 if s > self.MaxCodePoint: 115 return self 116 if e > self.MaxCodePoint: 117 e = self.MaxCodePoint 118 e = min(e, self.MaxCodePoint) 119 120 # Index of first code point equal to or greater than s 121 li = bisect.bisect_left(self.__codepoints, s) 122 # Index of last code point less than or equal to e 123 ri = bisect.bisect_right(self.__codepoints, e) 124 # There are four cases; if we're subtracting, they reflect. 125 case = ((li & 1) << 1) | (ri & 1) 126 if not do_add: 127 case = 3 - case 128 #print 'add %d %d to %s at %d %d' % (s, e, self.__codepoints, li, ri) 129 if 0x03 == case: 130 # Add: Incoming value begins and ends within existing ranges 131 del self.__codepoints[li:ri] 132 elif 0x02 == case: 133 # Add: Incoming value extends into an excluded range 134 del self.__codepoints[li+1:ri] 135 self.__codepoints[li] = e 136 elif 0x01 == case: 137 # Add: Incoming value begins in an excluded range 138 del self.__codepoints[li+1:ri] 139 self.__codepoints[li] = s 140 else: 141 # Add: Incoming value begins and ends within excluded ranges 142 self.__codepoints[li:ri] = [s, e] 143 return self

144

145 - def add (self, value):

146 """Add the given value to the code point set. 147 148 @param value: An integral value denoting a code point, or a 149 tuple C{(s,e)} denoting the start and end (inclusive) code 150 points in a range. 151 @return: C{self}""" 152 return self.__mutate(value, True)

153

154 - def extend (self, values):

155 """Add multiple values to a code point set. 156 157 @param values: Either a L{CodePointSet} instance, or an iterable 158 whose members are valid parameters to L{add}. 159 160 @return: C{self}""" 161 if isinstance(values, CodePointSet): 162 self.extend(values.asTuples()) 163 else: 164 [ self.__mutate(_v, True) for _v in values ] 165 return self

166

167 - def subtract (self, value):

168 """Remove the given value from the code point set. 169 170 @param value: An integral value denoting a code point, or a tuple 171 C{(s,e)} denoting the start and end (inclusive) code points in a 172 range, or a L{CodePointSet}. 173 174 @return: C{self}""" 175 if isinstance(value, CodePointSet): 176 [ self.subtract(_v) for _v in value.asTuples() ] 177 return self 178 return self.__mutate(value, False)

# Characters that must not appear unescaped in regular expression
# patterns.  Inside a character group the backslash starts an escape
# and a leading caret negates the group, so both need escaping just
# like the hyphen and the square brackets.
__NotXMLChar_set = frozenset([ '-', '[', ']', '\\', '^' ])

# Return the given code point as a unicode character suitable for
# use in a regular expression
def __unichr (self, code_point):
    """Convert a code point to text usable in a character group.

    @param code_point: The integral code point to convert.
    @return: The unicode character for C{code_point}, preceded by a
    backslash if the character is one of those that must not appear
    unescaped."""
    rv = unichr(code_point)
    if rv in self.__NotXMLChar_set:
        rv = u'\\' + rv
    return rv

191

def asPattern (self, with_brackets=True):
    """Render the set as a Unicode regular expression character
    group: a sequence of single characters and C{first-last}
    character ranges.

    @param with_brackets: If C{True} (default), square brackets
    are added to enclose the returned character group.
    @return: The character group as a unicode string."""
    pieces = []
    for (first, last) in self.asTuples():
        pieces.append(self.__unichr(first))
        if first != last:
            # A true range: hyphen followed by the last member.
            pieces.append('-')
            pieces.append(self.__unichr(last))
    group = u''.join(pieces)
    if with_brackets:
        return u'[' + group + u']'
    return group

210

211 - def asTuples (self):

212 """Return the codepoints as tuples denoting the ranges that are in 213 the set. 214 215 Each tuple C{(s, e)} indicates that the code points from C{s} 216 (inclusive) to C{e}) (inclusive) are in the set.""" 217 218 rv = [] 219 start = None 220 for ri in xrange(len(self.__codepoints)): 221 if start is not None: 222 rv.append( (start, self.__codepoints[ri]-1) ) 223 start = None 224 else: 225 start = self.__codepoints[ri] 226 if start is not None: 227 rv.append( (start, self.MaxCodePoint) ) 228 return rv

229

230 - def negate (self):

231 """Return an instance that represents the inverse of this set.""" 232 rv = type(self)() 233 if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]): 234 rv.__codepoints.extend(self.__codepoints[1:]) 235 else: 236 rv.__codepoints.append(0) 237 rv.__codepoints.extend(self.__codepoints) 238 return rv

239

def asSingleCharacter (self):
    """Return the sole member of this set as a unicode string.

    @return: The unicode character for the first boundary, when the
    internal representation is one start/end pair that spans at most
    one code point.
    @raise CodePointSetError: The internal representation is not a
    single pair, or the pair spans more than one code point."""
    bounds = self.__codepoints
    if 2 == len(bounds):
        (first, limit) = bounds
        if (limit - first) <= 1:
            return unichr(first)
    raise CodePointSetError('CodePointSet does not represent single character')

Source Code for Module pyxb.utils.unicode