1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 """Support for regular expressions conformant to the XML Schema specification.
16
17 For the most part, XML regular expressions are similar to the POSIX
18 ones, and can be handled by the Python C{re} module. The exceptions
19 are for multi-character (C{\w}) and category escapes (e.g., C{\N} or
20 C{\p{IPAExtensions}}) and the character set subtraction capability.
21 This module supports those by scanning the regular expression,
22 replacing the category escapes with equivalent charset expressions.
23 It further detects the subtraction syntax and modifies the charset
24 expression to remove the unwanted code points.
25
26 The basic technique is to step through the characters of the regular
27 expression, entering a recursive-descent parser when one of the
28 translated constructs is encountered.
29
30 There is a nice set of XML regular expressions at
31 U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
32 with a sample document at U{
33 http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}"""
34
35 import unicode
36 import re
37
class RegularExpressionError(ValueError):
    """Raised when a regular expression cannot be processed.

    @ivar position: The offset within the regular expression text that
    was passed to the constructor.
    """

    def __init__(self, position, description):
        """Record the failure position and build the exception message.

        @param position: Integer offset within the regular expression
        text at which the problem was detected.

        @param description: Text describing the problem; it is appended
        to the position in the exception message.
        """
        self.position = position
        ValueError.__init__(self, 'At %d: %s' % (position, description))
43
def _MatchCharPropBraced(text, position):
    """Match a U{character property
    <http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}
    or U{multi-character escape
    <http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>} identifier,
    which will be enclosed in braces.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the opening brace of the character
    property

    @return: A pair C{(cps, p)} where C{cps} is a
    L{unicode.CodePointSet} containing the code points associated with
    the property, and C{p} is the text offset immediately following
    the closing brace.

    @raise RegularExpressionError: if opening or closing braces are
    missing, or if the text between them cannot be recognized as a
    property or block identifier.
    """
    if position >= len(text):
        raise RegularExpressionError(position, "Missing brace after category escape")
    if '{' != text[position]:
        raise RegularExpressionError(position, "Unexpected character '%s' after category escape" % (text[position],))
    ep = text.find('}', position + 1)
    if 0 > ep:
        raise RegularExpressionError(position, "Unterminated category")
    char_prop = text[position + 1:ep]
    if char_prop.startswith('Is'):
        # An "Is" prefix selects the block table; the lookup key is the
        # name with that prefix removed.
        char_prop = char_prop[2:]
        cs = unicode.BlockMap.get(char_prop)
        if cs is None:
            raise RegularExpressionError(position, "Unrecognized block name '%s'" % (char_prop,))
        return (cs, ep + 1)
    # Anything else is looked up in the property table.
    cs = unicode.PropertyMap.get(char_prop)
    if cs is None:
        raise RegularExpressionError(position, "Unrecognized character property '%s'" % (char_prop,))
    return (cs, ep + 1)
84
def _MaybeMatchCharClassEsc(text, position, include_sce=True):
    """Attempt to match a U{character class escape
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>}
    expression.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the backslash that would begin the
    potential character class escape

    @param include_sce: Optional directive to include single-character
    escapes in addition to character class escapes.  Default is
    C{True}.

    @return: C{None} if C{position} does not begin a character class
    escape (including when C{position} is at or past the end of
    C{text}); otherwise a pair C{(cps, p)} as in
    L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if the backslash is the last
    character of C{text}, or if the character following it is not a
    recognized escape identifier.
    """
    # Guard the index so that a position at end-of-text is reported as
    # "no escape here" rather than escaping as an IndexError.
    if (position >= len(text)) or ('\\' != text[position]):
        return None
    position += 1
    if position >= len(text):
        raise RegularExpressionError(position, "Incomplete character escape")
    nc = text[position]
    np = position + 1
    cs = None
    if include_sce:
        cs = unicode.SingleCharEsc.get(nc)
    if cs is None:
        cs = unicode.MultiCharEsc.get(nc)
    if cs is not None:
        return (cs, np)
    if 'p' == nc:
        return _MatchCharPropBraced(text, np)
    if 'P' == nc:
        # Upper-case P selects the complement of the named property.
        (cs, np) = _MatchCharPropBraced(text, np)
        return (cs.negate(), np)
    if (not include_sce) and (nc in unicode.SingleCharEsc):
        # A single-character escape that the caller asked us to leave
        # alone: not an error, just not ours to consume.
        return None
    raise RegularExpressionError(np, "Unrecognized escape identifier '\\%s'" % (nc,))
125
126 _NotXMLChar_set = frozenset([ '-', '[', ']' ])
127 """The set of characters that cannot appear within a character class
128 expression unescaped."""
129
def _CharOrSCE(text, position):
    """Return the single character represented at the given position.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the character to return.  If this
    is a backslash, additional text is consumed in order to identify
    the U{single-character escape <http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>}
    that begins at the position.

    @return: A pair C{(c, p)} where C{c} is the Unicode character
    specified at the position, and C{p} is the text offset immediately
    following that character or escape sequence.

    @raise RegularExpressionError: if the position has no character,
    or has a character in L{_NotXMLChar_set} or the position begins an
    escape sequence that is not resolvable as a single-character
    escape.
    """
    if position >= len(text):
        raise RegularExpressionError(position, "Missing character")
    rc = text[position]
    if rc in _NotXMLChar_set:
        # Report the offset of the offending character itself, not the
        # offset of whatever follows it.
        raise RegularExpressionError(position, "Unexpected character '%s'" % (rc,))
    position += 1
    if '\\' == rc:
        if position >= len(text):
            raise RegularExpressionError(position, "Incomplete escape sequence")
        charset = unicode.SingleCharEsc.get(text[position])
        if charset is None:
            # position - 1 is the offset of the backslash that started
            # the unrecognized escape.
            raise RegularExpressionError(position - 1, "Unrecognized single-character escape '\\%s'" % (text[position],))
        rc = charset.asSingleCharacter()
        position += 1
    return (rc, position)
166
def _MatchPosCharGroup(text, position):
    """Match a U{positive character
    group<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>}
    that begins at the given position.

    Scanning stops, without consuming it, at the first unescaped
    character in L{_NotXMLChar_set} or at the end of C{text}.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the positive character
    group.

    @return: a pair C{(cps, p)} as in L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    """
    cps = unicode.CodePointSet()
    # A hyphen in the first position is a literal.  The bounds check
    # keeps a truncated expression from raising IndexError here; the
    # caller reports the missing closing bracket.
    if (position < len(text)) and ('-' == text[position]):
        cps.add(ord('-'))
        position += 1
    while position < len(text):
        if '\\' == text[position]:
            # Only class escapes are taken here; a single-character
            # escape yields None and falls through to _CharOrSCE so it
            # can serve as a range endpoint.
            cg = _MaybeMatchCharClassEsc(text, position, include_sce=False)
            if cg is not None:
                (charset, position) = cg
                cps.extend(charset)
                continue
        if text[position] in _NotXMLChar_set:
            break
        (sc0, np) = _CharOrSCE(text, position)
        osc0 = ord(sc0)
        if (np < len(text)) and ('-' == text[np]):
            # "x-y": consume the hyphen and the upper endpoint.
            np += 1
            (sc1, np) = _CharOrSCE(text, np)
            osc1 = ord(sc1)
            if osc0 > osc1:
                raise RegularExpressionError(position, 'Character range must be non-decreasing')
            cps.add((osc0, osc1))
        else:
            cps.add(osc0)
        position = np

    return (cps, position)
211
def _MatchCharGroup(text, position):
    """Match a U{character group<http://www.w3.org/TR/xmlschema-2/#nt-charGroup>}
    at the given position.

    A leading C{^} negates the group, and a C{-} following the group
    introduces a character class expression whose code points are
    subtracted from it.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the character group.

    @return: a pair C{(cps, p)} as in L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    """
    if position >= len(text):
        raise RegularExpressionError(position, 'Expected character group')
    np = position
    negative_group = ('^' == text[np])
    if negative_group:
        np += 1
        # The text may end right after the caret; report that as a
        # syntax error instead of indexing past the end.
        if np >= len(text):
            raise RegularExpressionError(np, 'Expected character group')
    (cps, np) = _MatchPosCharGroup(text, np)
    if negative_group:
        cps = cps.negate()
    if (np < len(text)) and ('-' == text[np]):
        # Subtraction: the hyphen must be followed by a bracketed
        # character class expression.
        (ncps, np) = _MatchCharClassExpr(text, np + 1)
        cps.subtract(ncps)
    return (cps, np)
240
def _MatchCharClassExpr(text, position):
    """Match a U{character class expression<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}
    at the given position.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the opening bracket of the
    character class expression.

    @return: a pair C{(cps, p)} as in L{_MatchCharPropBraced}, where
    C{p} is the offset immediately following the closing bracket.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    """
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    nc = text[position]
    np = position + 1
    if '[' != nc:
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (nc,))
    (cps, np) = _MatchCharGroup(text, np)
    if np >= len(text):
        raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
    if ']' != text[np]:
        raise RegularExpressionError(position, "Bad character class expression, ends with '%s'" % (text[np],))
    # The closing bracket directly follows the opening one: "[]".
    if 1 == (np - position):
        raise RegularExpressionError(position, "Empty character class not allowed")
    return (cps, np + 1)
269
def MaybeMatchCharacterClass(text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three forms are recognized: the wildcard C{.}, a bracketed
    character class expression, and a backslash escape.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} does not begin a character class
    expression; otherwise a pair C{(cps, p)} as in
    L{_MatchCharPropBraced}.

    @raise RegularExpressionError: if a bracketed expression or escape
    begins at C{position} but is syntactically invalid.
    """
    if position >= len(text):
        return None
    c = text[position]
    np = position + 1
    if '.' == c:
        return (unicode.WildcardEsc, np)
    if '[' == c:
        return _MatchCharClassExpr(text, position)
    return _MaybeMatchCharClassEsc(text, position)
292
def XMLToPython(pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    The result is anchored at both ends.  The translated body is
    wrapped in a non-capturing group so that the anchors apply to
    every branch of a top-level alternation.  The characters C{^} and
    C{$} are ordinary characters in XML regular expressions, so they
    are escaped to keep Python from treating them as anchors.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if a character class expression or
    escape within C{pattern} is syntactically invalid.
    """
    # NOTE(review): Python's '$' also matches just before a trailing
    # newline; \Z would be stricter.  Kept as '$' here to limit the
    # change in the generated pattern -- confirm against callers.
    new_pattern_elts = ['^(?:']
    position = 0
    while position < len(pattern):
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch in ('^', '$'):
                # Literal in XML Schema, metacharacter in Python.
                new_pattern_elts.append('\\' + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    new_pattern_elts.append(')$')
    return ''.join(new_pattern_elts)
316