pyxb.utils.xmlre

1 # -*- coding: utf-8 -*- 2 # Copyright 2009-2012, Peter A. Bigot 3 # Copyright 2012, Jon Foster 4 # 5 # Licensed under the Apache License, Version 2.0 (the "License"); you may 6 # not use this file except in compliance with the License. You may obtain a 7 # copy of the License at: 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14 # License for the specific language governing permissions and limitations 15 # under the License. 16 17 """Support for regular expressions conformant to the XML Schema specification. 18 19 For the most part, XML regular expressions are similar to the POSIX 20 ones, and can be handled by the Python C{re} module. The exceptions 21 are for multi-character (C{\w}) and category escapes (e.g., C{\p{N}} or 22 C{\p{IPAExtensions}}) and the character set subtraction capability. 23 This module supports those by scanning the regular expression, 24 replacing the category escapes with equivalent charset expressions. 25 It further detects the subtraction syntax and modifies the charset 26 expression to remove the unwanted code points. 27 28 The basic technique is to step through the characters of the regular 29 expression, entering a recursive-descent parser when one of the 30 translated constructs is encountered. 31 32 There is a nice set of XML regular expressions at 33 U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd}, 34 with a sample document at U{ 35 http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}""" 36 37 import pyxb.utils.unicode 38 import re 39 import logging 40 41 _log = logging.getLogger(__name__) 42 43 # AllEsc maps all the possible escape codes and wildcards in an XML schema 44 # regular expression into the corresponding CodePointSet. 45 _AllEsc = { } 46

def _InitializeAllEsc ():
    """Fill the module-level C{_AllEsc} table.

    The wildcard C{.} is registered under its own text; every entry of
    the escape tables provided by L{pyxb.utils.unicode} is registered
    under a backslash followed by the table key.

    The work is done inside a function so that the loop variables do not
    become module attributes.
    """
    _AllEsc[u'.'] = pyxb.utils.unicode.WildcardEsc

    # Processed in this order; a later table overrides an earlier one if
    # they happen to share a key.
    escape_tables = (
        pyxb.utils.unicode.SingleCharEsc,
        pyxb.utils.unicode.MultiCharEsc,
        pyxb.utils.unicode.catEsc,
        pyxb.utils.unicode.complEsc,
        pyxb.utils.unicode.IsBlockEsc,
    )
    for table in escape_tables:
        for code, code_points in table.iteritems():
            _AllEsc[u'\\' + unicode(code)] = code_points

# Populate _AllEsc once, when the module is imported.
_InitializeAllEsc()


class RegularExpressionError (ValueError):
    """Raised when a regular expression cannot be processed.

    @ivar position: The offset into the regular expression text at
    which the problem was detected.

    @ivar description: The human-readable explanation of the problem,
    without the position prefix.
    """

    def __init__ (self, position, description):
        """Create the error.

        @param position: Integer offset into the pattern text.

        @param description: Text describing what is wrong.

        The exception message is C{'At <position>: <description>'}.
        """
        self.position = position
        self.description = description
        ValueError.__init__(self, 'At %d: %s' % (position, description))

    def __reduce__ (self):
        # The base class only remembers the single formatted message in
        # C{args}, but the constructor needs two arguments.  Without this
        # override, copying or unpickling the exception would call the
        # constructor with the wrong argument list and fail.
        return (self.__class__, (self.position, self.description))


# Matches one backslash escape.  After the backslash comes either:
#  - group 'cgProp': the letter p or P followed by a brace-enclosed name,
#    where the name (group 'charProp') consists of letters, digits and
#    hyphens; or
#  - group 'cgClass': any single character other than p or P.
_CharClassEsc_re = re.compile(r'\\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))')

def _MatchCharClassEsc(text, position):
    r"""Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term.

    The term is one of:

     - a U{SingleCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>},
       i.e. an escaped single character such as C{E{\}n};

     - a U{MultiCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>},
       i.e. an escape that stands for a set of characters, such as
       C{E{\}s};

     - a U{catEsc<http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}, the
       C{E{\}pE{lb}...E{rb}} Unicode property escape (categories and
       blocks);

     - a U{complEsc<http://www.w3.org/TR/xmlschema-2/#nt-complEsc>}, the
       C{E{\}PE{lb}...E{rb}} inverted Unicode property escape.

    @param text: The complete text of the regular expression.

    @param position: The offset of the backslash that starts the escape.

    @return: A pair C{(cps, p)} where C{cps} is the value registered for
    the escape in C{_AllEsc} (a L{pyxb.utils.unicode.CodePointSet}) and
    C{p} is the text offset immediately following the escape sequence.

    @raise RegularExpressionError: if the text at C{position} is not a
    well-formed escape, or is well-formed but not a known escape.
    """

    match = _CharClassEsc_re.match(text, position)
    if match is None:
        # Not even shaped like an escape (e.g. nothing after the backslash).
        raise RegularExpressionError(position, "Unrecognized escape identifier at %s" % (text[position:],))

    escape = match.group(0)
    code_points = _AllEsc.get(escape)
    if code_points is not None:
        return (code_points, match.end())

    # Well-formed but unknown: choose the most specific diagnostic.
    prop_name = match.group('charProp')
    if prop_name is None:
        raise RegularExpressionError(position, 'Unrecognized character class %s' % (escape,))
    if prop_name.startswith('Is'):
        raise RegularExpressionError(position, 'Unrecognized Unicode block %s in %s' % (prop_name[2:], escape))
    raise RegularExpressionError(position, 'Unrecognized character property %s' % (escape,))

114

def _MatchPosCharGroup(text, position):
    r'''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    @param text: The complete text of the regular expression.

    @param position: The offset of the first character of the group,
    i.e. just after the opening C{[} and any C{^}.

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the group is followed by the C{-[} of a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} otherwise;
      - C{p} is the text offset at which scanning stopped: the offset of the C{-} when C{fs} is C{True}, otherwise the offset of the closing C{]}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''

    group_start = position
    text_len = len(text)

    # Unique marker for an unescaped "-".  An escaped "\-" is stored as
    # the character u'-' instead, so the two can be told apart.  The
    # marker is neither unicode nor a CodePointSet.
    dash = object()

    # Pass 1: split the group into tokens (characters, code point sets
    # and dash markers).  Ranges are assembled in pass 2.
    items = []
    subtraction_follows = False
    while True:
        if position >= text_len:
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == u']':
            # End of the group; leave position on the bracket.
            break
        if ch == u'[':
            # Only legal as the second half of a "-[" subtraction.
            if not (items and items[-1] is dash):
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            subtraction_follows = True
            # The "-[" is not part of the posCharGroup, so give the dash
            # back and leave position pointing at it.
            items.pop()
            position -= 1
            break
        if ch == u'\\':
            cps, position = _MatchCharClassEsc(text, position)
            lone = cps.asSingleCharacter()
            items.append(cps if lone is None else lone)
            continue
        items.append(dash if ch == u'-' else ch)
        position += 1

    if not items:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # A dash at either end of the group can only be a literal.
    for edge in (0, -1):
        if items[edge] is dash:
            items[edge] = u'-'

    # Pass 2: fold "x-y" triples into ranges and collect everything.
    result = pyxb.utils.unicode.CodePointSet()
    count = len(items)
    index = 0
    while index < count:
        low = items[index]
        if index + 2 < count and items[index + 1] is dash:
            high = items[index + 2]
            if not (isinstance(low, unicode) and isinstance(high, unicode)):
                if low is dash or high is dash:
                    raise RegularExpressionError(group_start, 'Two dashes in a row is not allowed in the middle of a character class.')
                raise RegularExpressionError(group_start, 'Dashes must be surrounded by characters, not character class escapes. %r %r' %(low, high))
            if low > high:
                raise RegularExpressionError(group_start, 'Character ranges must have the lowest character first')
            result.add((ord(low), ord(high)))
            index += 3
            continue
        if low is dash:
            raise RegularExpressionError(group_start, 'Dash without an initial character')
        if isinstance(low, unicode):
            result.add(ord(low))
        else:
            # A multi-character escape: merge its whole code point set.
            result.extend(low)
        index += 1

    return result, subtraction_follows, position


def _MatchCharClassExpr(text, position):
    '''Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]},
    C{[^abc]}, or C{[a-z-[q]]}.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the C{[} that starts the character
    class.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    matched by the class, and C{p} is the text offset immediately
    following the closing C{]}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    text_len = len(text)
    if position >= text_len:
        raise RegularExpressionError(position, 'Missing character class expression')
    opener = text[position]
    if opener != u'[':
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (opener,))

    cursor = position + 1
    if cursor >= text_len:
        raise RegularExpressionError(cursor, 'Missing character class expression')

    # A leading caret inverts the group that follows it.
    negated = (text[cursor] == '^')
    if negated:
        cursor += 1

    group_cps, subtraction_follows, cursor = _MatchPosCharGroup(text, cursor)
    if negated:
        group_cps = group_cps.negate()

    if subtraction_follows:
        # The group parser stopped on the "-" of a "-[" pair; the nested
        # class to subtract starts at the bracket after it.
        assert text[cursor] == u'-'
        assert text[cursor + 1] == u'['
        excluded_cps, cursor = _MatchCharClassExpr(text, cursor + 1)
        group_cps.subtract(excluded_cps)

    if cursor < text_len and text[cursor] == u']':
        return group_cps, cursor + 1
    raise RegularExpressionError(cursor, "Expected ']' to end character class")

249

def MaybeMatchCharacterClass (text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three constructs are recognized: the wildcard C{.}, a bracketed
    class starting with C{[}, and an escape starting with a backslash.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} is past the end of C{text} or does
    not begin one of the recognized constructs; otherwise a pair
    C{(cps, p)} where C{cps} is a L{pyxb.utils.unicode.CodePointSet}
    containing the code points matched by the construct, and C{p} is
    the text offset immediately following it."""
    if position >= len(text):
        return None
    lead = text[position]
    if lead == '.':
        return (pyxb.utils.unicode.WildcardEsc, position + 1)
    # The remaining constructs are handed to their dedicated parsers.
    parser = {
        '[': _MatchCharClassExpr,
        '\\': _MatchCharClassEsc,
    }.get(lead)
    if parser is None:
        return None
    return parser(text, position)

276

def XMLToPython (pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    Character class constructs (the wildcard, bracketed classes and
    backslash escapes) are replaced by the Python pattern of their code
    point set.  C{^} and C{$}, which are ordinary characters in XSD,
    are escaped.  All other characters are copied through unchanged.

    The translated body is wrapped as C{^(?:...)$}.  The group is
    required because an XSD pattern must match the whole value, and
    alternation binds more loosely than the anchors: without it C{a|b}
    would become C{^a|b$}, which accepts any string that merely starts
    with C{a} or ends with C{b}.  A non-capturing group is used so that
    the numbering of groups in the pattern is not shifted.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if a character class construct in
    C{pattern} is syntactically invalid."""
    assert isinstance(pattern, unicode)
    new_pattern_elts = ['^(?:']
    position = 0
    while position < len(pattern):
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch == u'^' or ch == u'$':
                # These characters have no special meaning in XSD.  But they
                # match start and end of string in Python, so they have to
                # be escaped.
                new_pattern_elts.append(u'\\' + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    # NOTE(review): Python's "$" also matches just before a trailing
    # newline; the closing anchor is kept as-is for compatibility.
    new_pattern_elts.append(')$')
    return ''.join(new_pattern_elts)

308

Source Code for Module pyxb.utils.xmlre