Skip to content

Commit 9da2e04

Browse files
authored
Update html* from 3.13.7 (RustPython#6133)
1 parent 1d53e0c commit 9da2e04

File tree

2 files changed

+437
-167
lines changed

2 files changed

+437
-167
lines changed

Lib/html/parser.py

Lines changed: 129 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,48 @@
2727
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2828

2929
starttagopen = re.compile('<[a-zA-Z]')
30+
endtagopen = re.compile('</[a-zA-Z]')
3031
piclose = re.compile('>')
31-
commentclose = re.compile(r'--\s*>')
32+
commentclose = re.compile(r'--!?>')
33+
commentabruptclose = re.compile(r'-?>')
3234
# Note:
33-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
34-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
35+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
36+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3537
# explode, so don't do it.
36-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
37-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
38-
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
39-
attrfind_tolerant = re.compile(
40-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
41-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
38+
# see the HTML5 specs section "13.2.5.6 Tag open state",
39+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
40+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
41+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
42+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
43+
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
44+
attrfind_tolerant = re.compile(r"""
45+
(
46+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
47+
)
48+
([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
49+
('[^']*' # LITA-enclosed value
50+
|"[^"]*" # LIT-enclosed value
51+
|(?!['"])[^>\t\n\r\f ]* # bare value
52+
)
53+
)?
54+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
55+
""", re.VERBOSE)
56+
locatetagend = re.compile(r"""
57+
[a-zA-Z][^\t\n\r\f />]* # tag name
58+
[\t\n\r\f /]* # optional whitespace before attribute name
59+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
60+
(?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
61+
(?:'[^']*' # LITA-enclosed value
62+
|"[^"]*" # LIT-enclosed value
63+
|(?!['"])[^>\t\n\r\f ]* # bare value
64+
)
65+
)?
66+
[\t\n\r\f /]* # possibly followed by a space
67+
)*
68+
>?
69+
""", re.VERBOSE)
70+
# The following variables are not used, but are temporarily left for
71+
# backward compatibility.
4272
locatestarttagend_tolerant = re.compile(r"""
4373
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4474
(?:[\s/]* # optional whitespace before attribute name
@@ -55,8 +85,6 @@
5585
\s* # trailing whitespace
5686
""", re.VERBOSE)
5787
endendtag = re.compile('>')
58-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
59-
# </ and the tag name, so maybe this should be fixed
6088
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6189

6290
# Character reference processing logic specific to attribute values
@@ -100,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
100128
"""
101129

102130
CDATA_CONTENT_ELEMENTS = ("script", "style")
131+
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
103132

104133
def __init__(self, *, convert_charrefs=True):
105134
"""Initialize and reset this instance.
@@ -117,6 +146,7 @@ def reset(self):
117146
self.lasttag = '???'
118147
self.interesting = interesting_normal
119148
self.cdata_elem = None
149+
self._escapable = True
120150
super().reset()
121151

122152
def feed(self, data):
@@ -138,13 +168,20 @@ def get_starttag_text(self):
138168
"""Return full source of start tag: '<...>'."""
139169
return self.__starttag_text
140170

141-
def set_cdata_mode(self, elem):
171+
def set_cdata_mode(self, elem, *, escapable=False):
142172
self.cdata_elem = elem.lower()
143-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
173+
self._escapable = escapable
174+
if escapable and not self.convert_charrefs:
175+
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
176+
re.IGNORECASE|re.ASCII)
177+
else:
178+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
179+
re.IGNORECASE|re.ASCII)
144180

145181
def clear_cdata_mode(self):
146182
self.interesting = interesting_normal
147183
self.cdata_elem = None
184+
self._escapable = True
148185

149186
# Internal -- handle data as far as reasonable. May leave state
150187
# and data to be processed by a subsequent call. If 'end' is
@@ -165,7 +202,7 @@ def goahead(self, end):
165202
# & near the end and see if it's followed by a space or ;.
166203
amppos = rawdata.rfind('&', max(i, n-34))
167204
if (amppos >= 0 and
168-
not re.compile(r'[\s;]').search(rawdata, amppos)):
205+
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
169206
break # wait till we get all the text
170207
j = n
171208
else:
@@ -177,7 +214,7 @@ def goahead(self, end):
177214
break
178215
j = n
179216
if i < j:
180-
if self.convert_charrefs and not self.cdata_elem:
217+
if self.convert_charrefs and self._escapable:
181218
self.handle_data(unescape(rawdata[i:j]))
182219
else:
183220
self.handle_data(rawdata[i:j])
@@ -195,25 +232,43 @@ def goahead(self, end):
195232
k = self.parse_pi(i)
196233
elif startswith("<!", i):
197234
k = self.parse_html_declaration(i)
198-
elif (i + 1) < n:
235+
elif (i + 1) < n or end:
199236
self.handle_data("<")
200237
k = i + 1
201238
else:
202239
break
203240
if k < 0:
204241
if not end:
205242
break
206-
k = rawdata.find('>', i + 1)
207-
if k < 0:
208-
k = rawdata.find('<', i + 1)
209-
if k < 0:
210-
k = i + 1
211-
else:
212-
k += 1
213-
if self.convert_charrefs and not self.cdata_elem:
214-
self.handle_data(unescape(rawdata[i:k]))
243+
if starttagopen.match(rawdata, i): # < + letter
244+
pass
245+
elif startswith("</", i):
246+
if i + 2 == n:
247+
self.handle_data("</")
248+
elif endtagopen.match(rawdata, i): # </ + letter
249+
pass
250+
else:
251+
# bogus comment
252+
self.handle_comment(rawdata[i+2:])
253+
elif startswith("<!--", i):
254+
j = n
255+
for suffix in ("--!", "--", "-"):
256+
if rawdata.endswith(suffix, i+4):
257+
j -= len(suffix)
258+
break
259+
self.handle_comment(rawdata[i+4:j])
260+
elif startswith("<![CDATA[", i):
261+
self.unknown_decl(rawdata[i+3:])
262+
elif rawdata[i:i+9].lower() == '<!doctype':
263+
self.handle_decl(rawdata[i+2:])
264+
elif startswith("<!", i):
265+
# bogus comment
266+
self.handle_comment(rawdata[i+2:])
267+
elif startswith("<?", i):
268+
self.handle_pi(rawdata[i+2:])
215269
else:
216-
self.handle_data(rawdata[i:k])
270+
raise AssertionError("we should not get here!")
271+
k = n
217272
i = self.updatepos(i, k)
218273
elif startswith("&#", i):
219274
match = charref.match(rawdata, i)
@@ -261,7 +316,7 @@ def goahead(self, end):
261316
assert 0, "interesting.search() lied"
262317
# end while
263318
if end and i < n:
264-
if self.convert_charrefs and not self.cdata_elem:
319+
if self.convert_charrefs and self._escapable:
265320
self.handle_data(unescape(rawdata[i:n]))
266321
else:
267322
self.handle_data(rawdata[i:n])
@@ -290,8 +345,23 @@ def parse_html_declaration(self, i):
290345
else:
291346
return self.parse_bogus_comment(i)
292347

348+
# Internal -- parse comment, return length or -1 if not terminated
349+
# see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
350+
def parse_comment(self, i, report=True):
351+
rawdata = self.rawdata
352+
assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
353+
match = commentclose.search(rawdata, i+4)
354+
if not match:
355+
match = commentabruptclose.match(rawdata, i+4)
356+
if not match:
357+
return -1
358+
if report:
359+
j = match.start()
360+
self.handle_comment(rawdata[i+4: j])
361+
return match.end()
362+
293363
# Internal -- parse bogus comment, return length or -1 if not terminated
294-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
364+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
295365
def parse_bogus_comment(self, i, report=1):
296366
rawdata = self.rawdata
297367
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -317,6 +387,8 @@ def parse_pi(self, i):
317387

318388
# Internal -- handle starttag, return end or -1 if not terminated
319389
def parse_starttag(self, i):
390+
# See the HTML5 specs section "13.2.5.8 Tag name state"
391+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
320392
self.__starttag_text = None
321393
endpos = self.check_for_whole_start_tag(i)
322394
if endpos < 0:
@@ -356,82 +428,50 @@ def parse_starttag(self, i):
356428
self.handle_starttag(tag, attrs)
357429
if tag in self.CDATA_CONTENT_ELEMENTS:
358430
self.set_cdata_mode(tag)
431+
elif tag in self.RCDATA_CONTENT_ELEMENTS:
432+
self.set_cdata_mode(tag, escapable=True)
359433
return endpos
360434

361435
# Internal -- check to see if we have a complete starttag; return end
362436
# or -1 if incomplete.
363437
def check_for_whole_start_tag(self, i):
364438
rawdata = self.rawdata
365-
m = locatestarttagend_tolerant.match(rawdata, i)
366-
if m:
367-
j = m.end()
368-
next = rawdata[j:j+1]
369-
if next == ">":
370-
return j + 1
371-
if next == "/":
372-
if rawdata.startswith("/>", j):
373-
return j + 2
374-
if rawdata.startswith("/", j):
375-
# buffer boundary
376-
return -1
377-
# else bogus input
378-
if j > i:
379-
return j
380-
else:
381-
return i + 1
382-
if next == "":
383-
# end of input
384-
return -1
385-
if next in ("abcdefghijklmnopqrstuvwxyz=/"
386-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
387-
# end of input in or before attribute value, or we have the
388-
# '/' from a '/>' ending
389-
return -1
390-
if j > i:
391-
return j
392-
else:
393-
return i + 1
394-
raise AssertionError("we should not get here!")
439+
match = locatetagend.match(rawdata, i+1)
440+
assert match
441+
j = match.end()
442+
if rawdata[j-1] != ">":
443+
return -1
444+
return j
395445

396446
# Internal -- parse endtag, return end or -1 if incomplete
397447
def parse_endtag(self, i):
448+
# See the HTML5 specs section "13.2.5.7 End tag open state"
449+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
398450
rawdata = self.rawdata
399451
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
400-
match = endendtag.search(rawdata, i+1) # >
401-
if not match:
452+
if rawdata.find('>', i+2) < 0: # fast check
402453
return -1
403-
gtpos = match.end()
404-
match = endtagfind.match(rawdata, i) # </ + tag + >
405-
if not match:
406-
if self.cdata_elem is not None:
407-
self.handle_data(rawdata[i:gtpos])
408-
return gtpos
409-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
410-
namematch = tagfind_tolerant.match(rawdata, i+2)
411-
if not namematch:
412-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
413-
if rawdata[i:i+3] == '</>':
414-
return i+3
415-
else:
416-
return self.parse_bogus_comment(i)
417-
tagname = namematch.group(1).lower()
418-
# consume and ignore other stuff between the name and the >
419-
# Note: this is not 100% correct, since we might have things like
420-
# </tag attr=">">, but looking for > after the name should cover
421-
# most of the cases and is much simpler
422-
gtpos = rawdata.find('>', namematch.end())
423-
self.handle_endtag(tagname)
424-
return gtpos+1
454+
if not endtagopen.match(rawdata, i): # </ + letter
455+
if rawdata[i+2:i+3] == '>': # </> is ignored
456+
# "missing-end-tag-name" parser error
457+
return i+3
458+
else:
459+
return self.parse_bogus_comment(i)
425460

426-
elem = match.group(1).lower() # script or style
427-
if self.cdata_elem is not None:
428-
if elem != self.cdata_elem:
429-
self.handle_data(rawdata[i:gtpos])
430-
return gtpos
461+
match = locatetagend.match(rawdata, i+2)
462+
assert match
463+
j = match.end()
464+
if rawdata[j-1] != ">":
465+
return -1
431466

432-
self.handle_endtag(elem)
467+
# find the name: "13.2.5.8 Tag name state"
468+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
469+
match = tagfind_tolerant.match(rawdata, i+2)
470+
assert match
471+
tag = match.group(1).lower()
472+
self.handle_endtag(tag)
433473
self.clear_cdata_mode()
434-
return gtpos
474+
return j
435475

436476
# Overridable -- finish processing of start+end tag: <tag.../>
437477
def handle_startendtag(self, tag, attrs):

0 commit comments

Comments
 (0)