1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 """Support for regular expressions conformant to the XML Schema specification.
18
19 For the most part, XML regular expressions are similar to the POSIX
20 ones, and can be handled by the Python C{re} module. The exceptions
21 are for multi-character (C{\w}) and category escapes (e.g., C{\p{N}} or
22 C{\p{IPAExtensions}}) and the character set subtraction capability.
23 This module supports those by scanning the regular expression,
24 replacing the category escapes with equivalent charset expressions.
25 It further detects the subtraction syntax and modifies the charset
26 expression to remove the unwanted code points.
27
28 The basic technique is to step through the characters of the regular
29 expression, entering a recursive-descent parser when one of the
30 translated constructs is encountered.
31
32 There is a nice set of XML regular expressions at
33 U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
34 with a sample document at U{
35 http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}"""
36
37 import re
38 import logging
39 import pyxb.utils.unicode
40 from pyxb.utils import six
41
42 _log = logging.getLogger(__name__)
43
44
45
46 _AllEsc = { }
47
64 _InitializeAllEsc()
65
class RegularExpressionError (ValueError):
    """Raised when a regular expression cannot be processed.

    The offset at which the problem was detected is kept in the
    C{position} attribute and is also embedded in the exception
    message, which has the form C{At <position>: <description>}.
    """

    def __init__ (self, position, description):
        """Record where and why processing failed.

        @param position: Integer offset into the regular expression
        text at which the problem was detected.

        @param description: Human-readable explanation of the problem.
        """
        self.position = position
        ValueError.__init__(self, 'At %d: %s' % (position, description))
71
# Matches a backslash followed by either a \p{...} / \P{...} property
# escape or any single character other than p/P.
_CharClassEsc_re = re.compile(r'\\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))')


def _MatchCharClassEsc(text, position):
    r"""Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term.

    The escape must begin at C{position} with a backslash.  The whole
    matched escape (backslash included) is looked up in the
    module-level C{_AllEsc} table; the table entry is returned as-is.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the backslash that introduces the
    escape.

    @return: A pair C{(cps, p)} where C{cps} is the C{_AllEsc} entry
    registered for the escape, and C{p} is the text offset immediately
    following the escape sequence.

    @raise RegularExpressionError: if no escape can be recognized at
    C{position}, or the escape is well-formed but not present in
    C{_AllEsc}.
    """
    matched = _CharClassEsc_re.match(text, position)
    if not matched:
        # Nothing that even looks like an escape (e.g. a trailing backslash).
        raise RegularExpressionError(position, "Unrecognized escape identifier at %s" % (text[position:],))

    escape_code = matched.group(0)
    known_cps = _AllEsc.get(escape_code)
    if known_cps is not None:
        return (known_cps, matched.end())

    # The escape is syntactically valid but unknown; choose the most
    # specific diagnostic available.
    property_name = matched.group('charProp')
    if property_name is None:
        raise RegularExpressionError(position, 'Unrecognized character class %s' % (escape_code,))
    if property_name.startswith('Is'):
        # Property names with an "Is" prefix are reported as Unicode blocks.
        raise RegularExpressionError(position, 'Unrecognized Unicode block %s in %s' % (property_name[2:], escape_code))
    raise RegularExpressionError(position, 'Unrecognized character property %s' % (escape_code,))
116
def _MatchPosCharGroup(text, position):
    r'''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    Parsing is done in two passes: the group is first split into tokens
    (literal characters, code point sets produced by escapes, and a
    marker for each unescaped dash), then the tokens are combined into
    ranges and accumulated into a single code point set.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the first character of the group,
    i.e. just after the opening C{[} (and the C{^}, if present).

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the group is followed by the C{-[} of a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} otherwise;
      - C{p} is the text offset at which scanning stopped: the closing C{]} when C{fs} is C{False}, or the C{-} that introduces the subtraction when C{fs} is C{True}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    group_start = position

    # Unique marker for an unescaped "-".  It is neither a text string
    # nor a code point set, which lets it be told apart from an escaped
    # dash (which arrives as the one-character string "-").
    DASH = object()

    # Pass 1: tokenize.
    tokens = []
    has_following_subtraction = False
    text_length = len(text)
    while True:
        if position >= text_length:
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == six.u(']'):
            # End of the group; leave position on the bracket.
            break
        if ch == six.u('['):
            # A nested "[" is only legal as the start of a subtraction,
            # which requires that a dash was just read.
            if not (tokens and tokens[-1] is DASH):
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            has_following_subtraction = True
            # The "-[" does not belong to this group: discard the dash
            # token and step back so position refers to the dash.
            tokens.pop()
            position = position - 1
            break
        if ch == six.unichr(0x5c):
            # Backslash: delegate to the escape parser, which also
            # advances position past the escape.
            cps, position = _MatchCharClassEsc(text, position)
            single_char = cps.asSingleCharacter()
            tokens.append(cps if single_char is None else single_char)
            continue
        tokens.append(DASH if ch == six.u('-') else ch)
        position = position + 1

    if not tokens:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # A dash in the first or last slot cannot form a range, so it is
    # treated as a literal.
    if tokens[0] is DASH:
        tokens[0] = six.u('-')
    if tokens[-1] is DASH:
        tokens[-1] = six.u('-')

    # Pass 2: combine tokens into single code points, ranges, and sets.
    result_cps = pyxb.utils.unicode.CodePointSet()
    token_count = len(tokens)
    index = 0
    while index < token_count:
        first = tokens[index]
        is_range = index + 2 < token_count and tokens[index + 1] is DASH
        if not is_range:
            if first is DASH:
                raise RegularExpressionError(group_start, 'Dash without an initial character')
            if isinstance(first, six.text_type):
                result_cps.add(ord(first))
            else:
                result_cps.extend(first)
            index = index + 1
            continue

        last = tokens[index + 2]
        if not (isinstance(first, six.text_type) and isinstance(last, six.text_type)):
            if first is DASH or last is DASH:
                raise RegularExpressionError(group_start, 'Two dashes in a row is not allowed in the middle of a character class.')
            raise RegularExpressionError(group_start, 'Dashes must be surrounded by characters, not character class escapes. %r %r' %(first, last))
        if first > last:
            raise RegularExpressionError(group_start, 'Character ranges must have the lowest character first')
        result_cps.add((ord(first), ord(last)))
        index = index + 3

    return result_cps, has_following_subtraction, position
205
def _MatchCharClassExpr(text, position):
    '''Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]},
    C{[^abc]}, or C{[a-z-[q]]}.  A subtracted class is parsed by a
    recursive call to this function.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the C{[} that starts the character
    class expression.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    matched by the expression, and C{p} is the text offset immediately
    following the closing C{]}.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    text_length = len(text)
    if position >= text_length:
        raise RegularExpressionError(position, 'Missing character class expression')
    opener = text[position]
    if six.u('[') != opener:
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (opener,))

    cursor = position + 1
    if cursor >= text_length:
        raise RegularExpressionError(cursor, 'Missing character class expression')

    # A leading caret complements the positive group that follows it.
    negated = (text[cursor] == '^')
    if negated:
        cursor = cursor + 1

    group_cps, has_following_subtraction, cursor = _MatchPosCharGroup(text, cursor)
    if negated:
        group_cps = group_cps.negate()

    if has_following_subtraction:
        # The group parser stops on the "-" of "-[": check that, skip
        # the dash, and parse the class that is to be removed.
        assert text[cursor] == six.u('-')
        assert text[cursor + 1] == six.u('[')
        removed_cps, cursor = _MatchCharClassExpr(text, cursor + 1)
        group_cps.subtract(removed_cps)

    if not (cursor < text_length and text[cursor] == six.u(']')):
        raise RegularExpressionError(cursor, "Expected ']' to end character class")
    return group_cps, cursor + 1
251
def MaybeMatchCharacterClass(text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three constructs are recognized by their first character: the
    wildcard C{.}, a bracketed class starting with C{[}, and an escape
    starting with a backslash.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} does not begin a character class
    expression (including when it is at or past the end of C{text});
    otherwise a pair C{(cps, p)} where C{cps} is the code point set for
    the construct and C{p} is the text offset immediately following
    it.

    @raise RegularExpressionError: if a bracketed class or an escape
    starts at C{position} but is syntactically invalid."""
    if not position < len(text):
        return None
    lead = text[position]
    if lead == '[':
        return _MatchCharClassExpr(text, position)
    if lead == '\\':
        return _MatchCharClassEsc(text, position)
    if lead == '.':
        # The wildcard consumes exactly one character.
        return (pyxb.utils.unicode.WildcardEsc, position + 1)
    return None
278
def XMLToPython(pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    XML Schema patterns implicitly match the whole value, so the
    result is anchored at both ends.  The translated body is wrapped
    in a non-capturing group before the anchors are applied: C{|} has
    the lowest precedence in Python regular expressions, so without
    the group a pattern such as C{a|b} would become C{^a|b$}, which
    anchors only the first and last alternatives.

    Character class constructs (C{.}, C{[...]} and backslash escapes)
    are replaced by the pattern text of their code point sets.  The
    characters C{^} and C{$} outside such constructs are escaped so
    that Python treats them as literals rather than anchors.  All
    other characters are copied through unchanged.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: if a character class construct in
    C{pattern} is syntactically invalid."""
    assert isinstance(pattern, six.text_type)
    backslash = six.unichr(0x5c)
    # Open the anchor and a non-capturing group so that the anchors
    # apply to every top-level alternative.
    new_pattern_elts = ['^(?:']
    position = 0
    pattern_length = len(pattern)
    while position < pattern_length:
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch == six.u('^') or ch == six.u('$'):
                # Escape so Python treats these as literal characters.
                new_pattern_elts.append(backslash + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    new_pattern_elts.append(')$')
    return ''.join(new_pattern_elts)
310