1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
"""Support for regular expressions conformant to the XML Schema specification.

For the most part, XML regular expressions are similar to the POSIX
ones, and can be handled by the Python C{re} module.  The exceptions
are the multi-character escapes (e.g., C{\w}), the category escapes
(e.g., C{\p{N}} or C{\p{IPAExtensions}}), and the character set
subtraction capability.  This module supports those by scanning the
regular expression and replacing the escapes with equivalent charset
expressions.  It further detects the subtraction syntax and modifies
the charset expression to remove the unwanted code points.

The basic technique is to step through the characters of the regular
expression, entering a recursive-descent parser when one of the
translated constructs is encountered.

There is a nice set of XML regular expressions at
U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
with a sample document at
U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}"""
36
37 import pyxb.utils.unicode
38 import re
39 import logging
40
41 _log = logging.getLogger(__name__)
42
43
44
45 _AllEsc = { }
46
62 _InitializeAllEsc()
63
class RegularExpressionError (ValueError):
    """Raised when a regular expression cannot be processed.

    @ivar position: The value passed as C{position} to the
    constructor; the raise sites in this module pass an offset into
    the regular expression text.
    """

    def __init__ (self, position, description):
        """Record where and why processing failed.

        @param position: The offset at which the problem was detected.
        It is formatted with C{%d}, so it must be an integer.

        @param description: Human-readable explanation of the problem.
        The exception message is C{'At <position>: <description>'}.
        """
        self.position = position
        ValueError.__init__(self, 'At %d: %s' % (position, description))
69
# A backslash followed by either a property escape (p or P, then a
# brace-enclosed name of letters, digits and hyphens) or any single
# character other than p/P.
_CharClassEsc_re = re.compile(r'\\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))')

def _MatchCharClassEsc(text, position):
    r"""Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term.

    This is one of:

      - U{SingleCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>},
      an escaped single character such as C{E{\}n}

      - U{MultiCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>},
      an escape code that can match a range of characters,
      e.g. C{E{\}s} to match certain whitespace characters

      - U{catEsc<http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}, the
      C{E{\}pE{lb}...E{rb}} Unicode property escapes including
      categories and blocks

      - U{complEsc<http://www.w3.org/TR/xmlschema-2/#nt-complEsc>},
      the C{E{\}PE{lb}...E{rb}} inverted Unicode property escapes

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset in C{text} at which the escape (its
    leading backslash) is expected to start.

    @return: A pair C{(cps, p)} where C{cps} is the entry stored in
    the module-level C{_AllEsc} table for the matched escape text
    (the table object itself, not a copy), and C{p} is the text
    offset immediately following the escape sequence.

    @raise RegularExpressionError: if the text at C{position} is not a
    well-formed escape, or is well-formed but has no entry in
    C{_AllEsc}.
    """

    mo = _CharClassEsc_re.match(text, position)
    if mo:
        # The lookup key is the whole escape, backslash included.
        escape_code = mo.group(0)
        cps = _AllEsc.get(escape_code)
        if cps is not None:
            return (cps, mo.end())
        # Well-formed but unknown: pick the most specific diagnostic.
        char_prop = mo.group('charProp')
        if char_prop is not None:
            if char_prop.startswith('Is'):
                raise RegularExpressionError(position, 'Unrecognized Unicode block %s in %s' % (char_prop[2:], escape_code))
            raise RegularExpressionError(position, 'Unrecognized character property %s' % (escape_code,))
        raise RegularExpressionError(position, 'Unrecognized character class %s' % (escape_code,))
    # Not even syntactically an escape (e.g. a trailing backslash or a
    # malformed property name).
    raise RegularExpressionError(position, "Unrecognized escape identifier at %s" % (text[position:],))
114
def _MatchPosCharGroup(text, position):
    '''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the first character of the group,
    i.e. just after the opening C{[} (and the C{^}, if any).

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the next character is the C{-} in a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} if the group is not part of a charClassSub;
      - C{p} is the text offset of the character that ended the group: the closing C{]} when C{fs} is C{False}, or the C{-} that introduces the subtraction when C{fs} is C{True}.  The terminator itself is not consumed.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.  Errors found while assembling ranges report the offset
    at which the group started rather than the offending character.
    '''

    start_position = position

    # Text type that works on both Python 2 (unicode) and Python 3 (str).
    text_type = type(u'')

    # DASH is a unique marker for an unescaped "-".  It is neither a
    # text character nor a code point set, so it cannot be confused with
    # a dash that arrived as a character token (e.g. from an escape).
    DASH = object()

    # Tokenize first, then go back and stitch the ranges together.
    tokens = []
    has_following_subtraction = False
    while True:
        if position >= len(text):
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == u'[':
            # A nested "[" is only legal as the start of a subtraction,
            # which requires an immediately preceding unescaped dash.
            if not tokens or tokens[-1] is not DASH:
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            has_following_subtraction = True
            # The "-[" belongs to the subtraction, not to this group:
            # drop the dash token and back up so the caller sees the "-".
            tokens.pop()
            position = position - 1
            break
        elif ch == u']':
            # End of the group; leave the "]" for the caller.
            break
        elif ch == u'\\':
            cps, position = _MatchCharClassEsc(text, position)
            # Prefer a plain character token when the escape yields one
            # (presumably when it denotes a single code point), so it
            # can take part in a range; otherwise keep the whole set.
            single_char = cps.asSingleCharacter()
            if single_char is not None:
                tokens.append(single_char)
            else:
                tokens.append(cps)
        elif ch == u'-':
            tokens.append(DASH)
            position = position + 1
        else:
            tokens.append(ch)
            position = position + 1

    if not tokens:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # At the start or end of the group a dash is a literal character.
    if tokens[0] is DASH:
        tokens[0] = u'-'
    if tokens[-1] is DASH:
        tokens[-1] = u'-'
    result_cps = pyxb.utils.unicode.CodePointSet()
    cur_token = 0
    while cur_token < len(tokens):
        start = tokens[cur_token]
        if cur_token + 2 < len(tokens) and tokens[cur_token + 1] is DASH:
            # "start - end": both ends must be single characters.
            end = tokens[cur_token + 2]
            if not isinstance(start, text_type) or not isinstance(end, text_type):
                if start is DASH or end is DASH:
                    raise RegularExpressionError(start_position, 'Two dashes in a row is not allowed in the middle of a character class.')
                raise RegularExpressionError(start_position, 'Dashes must be surrounded by characters, not character class escapes. %r %r' %(start, end))
            if start > end:
                raise RegularExpressionError(start_position, 'Character ranges must have the lowest character first')
            result_cps.add((ord(start), ord(end)))
            cur_token = cur_token + 3
        else:
            if start is DASH:
                raise RegularExpressionError(start_position, 'Dash without an initial character')
            elif isinstance(start, text_type):
                result_cps.add(ord(start))
            else:
                # A code point set produced by a multi-character escape.
                result_cps.extend(start)
            cur_token = cur_token + 1

    return result_cps, has_following_subtraction, position
203
def _MatchCharClassExpr(text, position):
    '''Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]}, C{[^abc]}, or C{[a-z-[q]]}.

    @param text: The complete text of the regular expression being
    translated.  The character at C{position} must be the C{[}
    starting a character class.

    @param position: The offset of the start of the character group.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the character class, and C{p} is the text offset
    immediately following the closing C{]}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    if u'[' != text[position]:
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (text[position],))
    position = position + 1
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    # A leading "^" negates the group that follows.
    negated = (text[position] == '^')
    if negated:
        position = position + 1

    result_cps, has_following_subtraction, position = _MatchPosCharGroup(text, position)

    # Negation is applied before any subtraction, so "[^a-z-[q]]" is
    # "everything except a-z", then with "q" removed.
    if negated:
        result_cps = result_cps.negate()

    if has_following_subtraction:
        # _MatchPosCharGroup leaves position on the "-" of "-[" when it
        # reports a subtraction; these asserts guard that invariant.
        assert text[position] == u'-'
        assert text[position + 1] == u'['
        position = position + 1
        # The subtrahend is itself a full character class expression.
        sub_cps, position = _MatchCharClassExpr(text, position)
        result_cps.subtract(sub_cps)

    if position >= len(text) or text[position] != u']':
        raise RegularExpressionError(position, "Expected ']' to end character class")
    return result_cps, position + 1
249
def MaybeMatchCharacterClass (text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three constructs are recognized at C{position}: the wildcard
    C{.}, a bracketed class starting with C{[}, and an escape starting
    with a backslash.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} is at or past the end of C{text}
    or does not begin a character class expression; otherwise a pair
    C{(cps, p)} where C{cps} is the code point set associated with the
    construct (L{pyxb.utils.unicode.WildcardEsc} for the wildcard),
    and C{p} is the text offset immediately following the construct.

    @raise RegularExpressionError: if a bracketed class or escape
    starts at C{position} but is malformed."""
    if position >= len(text):
        return None
    c = text[position]
    if '.' == c:
        # The wildcard is a single character, so the next offset is
        # simply position + 1.
        return (pyxb.utils.unicode.WildcardEsc, position + 1)
    if '[' == c:
        return _MatchCharClassExpr(text, position)
    if '\\' == c:
        return _MatchCharClassEsc(text, position)
    return None
276
# NOTE(review): the "def" line of this function was missing from the
# text under review; the name XMLToPython follows the upstream PyXB
# module and should be confirmed against the callers.
def XMLToPython (pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    XML Schema patterns are implicitly anchored at both ends and treat
    C{^} and C{$} as ordinary characters.  The result is therefore
    anchored explicitly, and the translated pattern is wrapped in a
    non-capturing group so that the anchors apply to the whole pattern
    even when it contains a top-level alternation: C{a|b} must become
    C{^(?:a|b)$}, not C{^a|b$}.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if a character class or escape in
    C{pattern} is malformed or unrecognized."""
    # type(u'') is unicode on Python 2 and str on Python 3.
    assert isinstance(pattern, type(u''))
    new_pattern_elts = []
    new_pattern_elts.append('^(?:')
    position = 0
    while position < len(pattern):
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch == u'^' or ch == u'$':
                # These are literal characters in an XML pattern but
                # anchors in Python, so they must be escaped.
                new_pattern_elts.append(u'\\' + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            # A wildcard, bracketed class or escape: emit the
            # equivalent Python charset and skip past the construct.
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    new_pattern_elts.append(')$')
    return ''.join(new_pattern_elts)
308