pyxb.utils.xmlre

1 # Copyright 2009, Peter A. Bigot 2 # 3 # Licensed under the Apache License, Version 2.0 (the "License"); you may 4 # not use this file except in compliance with the License. You may obtain a 5 # copy of the License at: 6 # 7 # http://www.apache.org/licenses/LICENSE-2.0 8 # 9 # Unless required by applicable law or agreed to in writing, software 10 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 # License for the specific language governing permissions and limitations 13 # under the License. 14 15 """Support for regular expressions conformant to the XML Schema specification. 16 17 For the most part, XML regular expressions are similar to the POSIX 18 ones, and can be handled by the Python C{re} module. The exceptions 19 are for multi-character (C{\w}) and category escapes (e.g., C{\N} or 20 C{\p{IPAExtensions}}) and the character set subtraction capability. 21 This module supports those by scanning the regular expression, 22 replacing the category escapes with equivalent charset expressions. 23 It further detects the subtraction syntax and modifies the charset 24 expression to remove the unwanted code points. 25 26 The basic technique is to step through the characters of the regular 27 expression, entering a recursive-descent parser when one of the 28 translated constructs is encountered. 29 30 There is a nice set of XML regular expressions at 31 U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd}, 32 with a sample document at U{ 33 http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}""" 34 35 import unicode 36 import re 37

class RegularExpressionError (ValueError):
    """Raised when a regular expression cannot be processed.

    @ivar position: The offset within the regular expression text that
    was passed to the constructor.
    """

    def __init__ (self, position, description):
        """Record the offset and build the exception message.

        @param position: The offset in the regular expression text
        associated with the problem.  It is formatted with C{%d}.

        @param description: Text describing the problem.
        """
        message = 'At %d: %s' % (position, description)
        ValueError.__init__(self, message)
        self.position = position

43

def _MatchCharPropBraced (text, position):
    """Match the brace-enclosed identifier of a U{character property
    <http://www.w3.org/TR/xmlschema-2/#nt-catEsc>} or block escape.

    An identifier that starts with C{Is} is looked up (without that
    prefix) in C{unicode.BlockMap}; any other identifier is looked up
    in C{unicode.PropertyMap}.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset at which the opening brace is
    expected

    @return: A pair C{(cps, p)} where C{cps} is the value found in the
    relevant map for the identifier, and C{p} is the text offset
    immediately following the closing brace.

    @raise RegularExpressionError: if the opening or closing brace is
    missing, or if the identifier is not present in the relevant map.
    """
    if position >= len(text):
        raise RegularExpressionError(position, "Missing brace after category escape")
    opener = text[position]
    if opener != '{':
        raise RegularExpressionError(position, "Unexpected character '%s' after category escape" % (opener,))
    close_at = text.find('}', position + 1)
    if close_at < 0:
        raise RegularExpressionError(position, "Unterminated category")
    name = text[position + 1:close_at]
    # Select the lookup table and the matching complaint up front so
    # the lookup/raise/return sequence is written only once.
    if name.startswith('Is'):
        name = name[2:]
        table = unicode.BlockMap
        complaint = "Unrecognized block name '%s'"
    else:
        table = unicode.PropertyMap
        complaint = "Unrecognized character property '%s'"
    code_points = table.get(name)
    if code_points is None:
        raise RegularExpressionError(position, complaint % (name,))
    return (code_points, close_at + 1)

84

def _MaybeMatchCharClassEsc (text, position, include_sce=True):
    """Attempt to match a U{character class escape
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>}
    expression.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the backslash that would begin the
    potential character class escape.  It must be a valid index into
    C{text}.

    @param include_sce: Optional directive to include single-character
    escapes in addition to character class escapes.  Default is
    C{True}.

    @return: C{None} if C{position} does not hold a backslash, or if
    C{include_sce} is false and the escape is a single-character
    escape; otherwise a pair C{(cps, p)} as in
    L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if the text ends after the
    backslash, or the escape identifier is not recognized.
    """
    if text[position] != '\\':
        return None
    ident_at = position + 1
    if ident_at >= len(text):
        raise RegularExpressionError(ident_at, "Incomplete character escape")
    ident = text[ident_at]
    after = ident_at + 1

    # Single-character escapes are consulted first, but only on request;
    # multi-character escapes are always consulted.
    code_points = None
    if include_sce:
        code_points = unicode.SingleCharEsc.get(ident)
    if code_points is None:
        code_points = unicode.MultiCharEsc.get(ident)
    if code_points is not None:
        return (code_points, after)

    # Property escapes: the upper-case form is the complement of the
    # lower-case form.
    if ('p' == ident) or ('P' == ident):
        (code_points, after) = _MatchCharPropBraced(text, after)
        if 'P' == ident:
            code_points = code_points.negate()
        return (code_points, after)

    if include_sce or (ident not in unicode.SingleCharEsc):
        raise RegularExpressionError(after, "Unrecognized escape identifier '\\%s'" % (ident,))
    # A single-character escape the caller asked us not to handle.
    return None

125 126 _NotXMLChar_set = frozenset([ '-', '[', ']' ]) 127 """The set of characters that cannot appear within a character class 128 expression unescaped.""" 129

def _CharOrSCE (text, position):
    """Return the single character represented at the given position.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the character to return.  If this
    is a backslash, additional text is consumed in order to identify
    the U{single-character escape <http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>}
    that begins at the position.

    @return: A pair C{(c, p)} where C{c} is the character specified at
    the position, and C{p} is the text offset immediately following
    the character or escape sequence that was consumed.

    @raise RegularExpressionError: if the position has no character,
    or has a character in L{_NotXMLChar_set}, or the position begins an
    escape sequence that is not resolvable as a single-character
    escape.
    """

    if position >= len(text):
        raise RegularExpressionError(position, "Missing character")
    rc = text[position]
    if rc in _NotXMLChar_set:
        # Report the offset of the offending character itself, in the
        # same way the unrecognized-escape error below reports the
        # offset of its backslash.
        raise RegularExpressionError(position, "Unexpected character '%s'" % (rc,))
    next_position = position + 1
    if '\\' != rc:
        return (rc, next_position)
    if next_position >= len(text):
        raise RegularExpressionError(next_position, "Incomplete escape sequence")
    charset = unicode.SingleCharEsc.get(text[next_position])
    if charset is None:
        raise RegularExpressionError(position, "Unrecognized single-character escape '\\%s'" % (text[next_position],))
    return (charset.asSingleCharacter(), next_position + 1)

166

def _MatchPosCharGroup (text, position):
    """Match a U{positive character
    group<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>}
    that begins at the given position.

    Scanning stops, without consuming it, at the first unescaped
    character in L{_NotXMLChar_set} that is not part of a range, or at
    the end of the text.  A hyphen that is immediately followed by
    C{[} is never treated as a range operator: it is left for the
    caller, which interprets it as the start of a character class
    subtraction.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the positive character
    group.  This may equal C{len(text)}, in which case an empty set is
    returned and the caller is expected to diagnose the truncated
    expression.

    @return: a pair C{(cps, p)} as in L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    """
    cps = unicode.CodePointSet()
    text_len = len(text)
    # A leading hyphen is a literal.  Guard the index: the group may
    # begin at the very end of the text (e.g., the pattern "[^").
    if (position < text_len) and ('-' == text[position]):
        cps.add(ord('-'))
        position += 1
    while position < text_len:
        # NB: This is not ideal, as we have to hack around matching SCEs
        if '\\' == text[position]:
            cg = _MaybeMatchCharClassEsc(text, position, include_sce=False)
            if cg is not None:
                (charset, position) = cg
                cps.extend(charset)
                continue
            # Otherwise it is a single-character escape, which
            # _CharOrSCE handles below.
        if text[position] in _NotXMLChar_set:
            break
        (sc0, np) = _CharOrSCE(text, position)
        osc0 = ord(sc0)
        is_range = (np < text_len) and ('-' == text[np])
        if is_range and (np + 1 < text_len) and ('[' == text[np + 1]):
            # "-[" introduces a subtraction, not a range: an unescaped
            # '[' can never be the upper end of a range.
            is_range = False
        if is_range:
            (sc1, np) = _CharOrSCE(text, np + 1)
            osc1 = ord(sc1)
            if osc0 > osc1:
                raise RegularExpressionError(position, 'Character range must be non-decreasing')
            cps.add( (osc0, osc1) )
        else:
            cps.add(osc0)
        position = np

    return (cps, position)

211

def _MatchCharGroup (text, position):
    """Match a U{character group<http://www.w3.org/TR/xmlschema-2/#nt-charGroup>}
    at the given position.

    A leading C{^} complements the positive group that follows it.  If
    a hyphen follows the group, the character class expression after
    that hyphen is subtracted from the result.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the character group.

    @return: a pair C{(cps, p)} as in L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    """
    if position >= len(text):
        raise RegularExpressionError(position, 'Expected character group')
    is_negated = ('^' == text[position])
    group_start = position
    if is_negated:
        # Skip the caret.
        group_start += 1
    (code_points, cursor) = _MatchPosCharGroup(text, group_start)
    if is_negated:
        code_points = code_points.negate()
    if (cursor < len(text)) and ('-' == text[cursor]):
        # Subtraction: the hyphen is followed by a bracketed expression.
        (removed, cursor) = _MatchCharClassExpr(text, cursor + 1)
        code_points.subtract(removed)
    return (code_points, cursor)

240

def _MatchCharClassExpr (text, position):
    """Match a U{character class expression<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}
    at the given position.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the opening C{[} of the character
    class expression.

    @return: a pair C{(cps, p)} as in L{_MatchCharPropBraced}, where
    C{p} is the offset immediately following the closing C{]}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid: missing, not opened by C{[}, not closed by C{]}, or empty.
    """
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    opener = text[position]
    if opener != '[':
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (opener,))
    (code_points, close_at) = _MatchCharGroup(text, position + 1)
    if close_at >= len(text):
        raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
    closer = text[close_at]
    if closer != ']':
        raise RegularExpressionError(position, "Bad character class expression, ends with '%s'" % (closer,))
    if (close_at - position) == 1:
        # The closing bracket immediately follows the opening one.
        raise RegularExpressionError(position, "Empty character class not allowed")
    return (code_points, close_at + 1)

269

def MaybeMatchCharacterClass (text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}, the
    wildcard C{.}, or a character class escape.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} is at or beyond the end of the
    text or does not begin one of the recognized constructs; otherwise
    a pair C{(cps, p)} as in L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if a recognized construct begins at
    C{position} but is malformed.
    """
    if position >= len(text):
        return None
    lead = text[position]
    if lead == '[':
        return _MatchCharClassExpr(text, position)
    if lead == '.':
        return (unicode.WildcardEsc, position + 1)
    # Anything else can only be a backslash escape (or nothing at all).
    return _MaybeMatchCharClassEsc(text, position)

292

def XMLToPython (pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    Character class expressions, the wildcard and character class
    escapes are replaced by the pattern text of their code point sets.
    All other characters are copied through, except that the
    characters C{^} and C{$}, which are ordinary characters in XML
    regular expressions but anchors in Python, are escaped.

    The translated body is wrapped in a non-capturing group between
    the C{^} and C{$} anchors, so that a top-level alternation is
    anchored as a whole rather than only at its first and last
    branches.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if a character class construct in
    C{pattern} is malformed.
    """
    new_pattern_elts = ['^(?:']
    position = 0
    while position < len(pattern):
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ('^' == ch) or ('$' == ch):
                # Literal in XML Schema, anchor in Python: escape it.
                new_pattern_elts.append('\\' + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    new_pattern_elts.append(')$')
    return ''.join(new_pattern_elts)

316

Source Code for Module pyxb.utils.xmlre