2727attr_charref = re .compile (r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?' )
2828
2929starttagopen = re .compile ('<[a-zA-Z]' )
30+ endtagopen = re .compile ('</[a-zA-Z]' )
3031piclose = re .compile ('>' )
31- commentclose = re .compile (r'--\s*>' )
32+ commentclose = re .compile (r'--!?>' )
33+ commentabruptclose = re .compile (r'-?>' )
3234# Note:
33- # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
34- # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
35+ # 1) if you change tagfind/attrfind remember to update locatetagend too;
36+ # 2) if you change tagfind/attrfind and/or locatetagend the parser will
3537# explode, so don't do it.
36- # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
37- # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
38- tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*' )
39- attrfind_tolerant = re .compile (
40- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
41- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*' )
38+ # see the HTML5 specs section "13.2.5.6 Tag open state",
39+ # "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
40+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
41+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
42+ # https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
43+ tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*' )
44+ attrfind_tolerant = re .compile (r"""
45+ (
46+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
47+ )
48+ ([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
49+ ('[^']*' # LITA-enclosed value
50+ |"[^"]*" # LIT-enclosed value
51+ |(?!['"])[^>\t\n\r\f ]* # bare value
52+ )
53+ )?
54+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
55+ """ , re .VERBOSE )
56+ locatetagend = re .compile (r"""
57+ [a-zA-Z][^\t\n\r\f />]* # tag name
58+ [\t\n\r\f /]* # optional whitespace before attribute name
59+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
60+ (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
61+ (?:'[^']*' # LITA-enclosed value
62+ |"[^"]*" # LIT-enclosed value
63+ |(?!['"])[^>\t\n\r\f ]* # bare value
64+ )
65+ )?
66+ [\t\n\r\f /]* # possibly followed by a space
67+ )*
68+ >?
69+ """ , re .VERBOSE )
70+ # The following variables are not used, but are temporarily left for
71+ # backward compatibility.
4272locatestarttagend_tolerant = re .compile (r"""
4373 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4474 (?:[\s/]* # optional whitespace before attribute name
5585 \s* # trailing whitespace
5686""" , re .VERBOSE )
5787endendtag = re .compile ('>' )
58- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
59- # </ and the tag name, so maybe this should be fixed
6088endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
6189
6290# Character reference processing logic specific to attribute values
@@ -100,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
100128 """
101129
102130 CDATA_CONTENT_ELEMENTS = ("script" , "style" )
131+ RCDATA_CONTENT_ELEMENTS = ("textarea" , "title" )
103132
104133 def __init__ (self , * , convert_charrefs = True ):
105134 """Initialize and reset this instance.
@@ -117,6 +146,7 @@ def reset(self):
117146 self .lasttag = '???'
118147 self .interesting = interesting_normal
119148 self .cdata_elem = None
149+ self ._escapable = True
120150 super ().reset ()
121151
122152 def feed (self , data ):
@@ -138,13 +168,20 @@ def get_starttag_text(self):
138168 """Return full source of start tag: '<...>'."""
139169 return self .__starttag_text
140170
def set_cdata_mode(self, elem, *, escapable=False):
    """Switch the parser into raw-text mode for the content of *elem*.

    While this mode is active, ``self.interesting`` only matches the
    closing tag of *elem* (compared case-insensitively), i.e. ``</elem``
    followed by whitespace, ``/`` or ``>``.

    elem -- name of the element whose content is being parsed; it is
            stored lower-cased in ``self.cdata_elem``.
    escapable -- stored in ``self._escapable``.  When true and
            ``convert_charrefs`` is false, ``&`` is treated as
            interesting as well, so the scanner also stops at it.
    """
    self.cdata_elem = elem.lower()
    self._escapable = escapable
    # The element name is data, not a pattern: escape it so that names
    # containing regex metacharacters (for example "c++" or "x.y" added
    # by a subclass) neither raise re.error nor match unrelated tags.
    pattern = r'</%s(?=[\t\n\r\f />])' % re.escape(self.cdata_elem)
    if escapable and not self.convert_charrefs:
        pattern = '&|' + pattern
    self.interesting = re.compile(pattern, re.IGNORECASE | re.ASCII)
144180
def clear_cdata_mode(self):
    """Leave raw-text mode and restore the default scanner state.

    Resets the three attributes assigned by set_cdata_mode(): no current
    raw-text element, data is escapable again, and the default
    ``interesting`` pattern is back in effect.
    """
    self.cdata_elem = None
    self._escapable = True
    self.interesting = interesting_normal
148185
149186 # Internal -- handle data as far as reasonable. May leave state
150187 # and data to be processed by a subsequent call. If 'end' is
@@ -165,7 +202,7 @@ def goahead(self, end):
165202 # & near the end and see if it's followed by a space or ;.
166203 amppos = rawdata .rfind ('&' , max (i , n - 34 ))
167204 if (amppos >= 0 and
168- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
205+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
169206 break # wait till we get all the text
170207 j = n
171208 else :
@@ -177,7 +214,7 @@ def goahead(self, end):
177214 break
178215 j = n
179216 if i < j :
180- if self .convert_charrefs and not self .cdata_elem :
217+ if self .convert_charrefs and self ._escapable :
181218 self .handle_data (unescape (rawdata [i :j ]))
182219 else :
183220 self .handle_data (rawdata [i :j ])
@@ -195,25 +232,43 @@ def goahead(self, end):
195232 k = self .parse_pi (i )
196233 elif startswith ("<!" , i ):
197234 k = self .parse_html_declaration (i )
198- elif (i + 1 ) < n :
235+ elif (i + 1 ) < n or end :
199236 self .handle_data ("<" )
200237 k = i + 1
201238 else :
202239 break
203240 if k < 0 :
204241 if not end :
205242 break
206- k = rawdata .find ('>' , i + 1 )
207- if k < 0 :
208- k = rawdata .find ('<' , i + 1 )
209- if k < 0 :
210- k = i + 1
211- else :
212- k += 1
213- if self .convert_charrefs and not self .cdata_elem :
214- self .handle_data (unescape (rawdata [i :k ]))
243+ if starttagopen .match (rawdata , i ): # < + letter
244+ pass
245+ elif startswith ("</" , i ):
246+ if i + 2 == n :
247+ self .handle_data ("</" )
248+ elif endtagopen .match (rawdata , i ): # </ + letter
249+ pass
250+ else :
251+ # bogus comment
252+ self .handle_comment (rawdata [i + 2 :])
253+ elif startswith ("<!--" , i ):
254+ j = n
255+ for suffix in ("--!" , "--" , "-" ):
256+ if rawdata .endswith (suffix , i + 4 ):
257+ j -= len (suffix )
258+ break
259+ self .handle_comment (rawdata [i + 4 :j ])
260+ elif startswith ("<![CDATA[" , i ):
261+ self .unknown_decl (rawdata [i + 3 :])
262+ elif rawdata [i :i + 9 ].lower () == '<!doctype' :
263+ self .handle_decl (rawdata [i + 2 :])
264+ elif startswith ("<!" , i ):
265+ # bogus comment
266+ self .handle_comment (rawdata [i + 2 :])
267+ elif startswith ("<?" , i ):
268+ self .handle_pi (rawdata [i + 2 :])
215269 else :
216- self .handle_data (rawdata [i :k ])
270+ raise AssertionError ("we should not get here!" )
271+ k = n
217272 i = self .updatepos (i , k )
218273 elif startswith ("&#" , i ):
219274 match = charref .match (rawdata , i )
@@ -261,7 +316,7 @@ def goahead(self, end):
261316 assert 0 , "interesting.search() lied"
262317 # end while
263318 if end and i < n :
264- if self .convert_charrefs and not self .cdata_elem :
319+ if self .convert_charrefs and self ._escapable :
265320 self .handle_data (unescape (rawdata [i :n ]))
266321 else :
267322 self .handle_data (rawdata [i :n ])
@@ -290,8 +345,23 @@ def parse_html_declaration(self, i):
290345 else :
291346 return self .parse_bogus_comment (i )
292347
348+ # Internal -- parse comment, return length or -1 if not terminated
349+ # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
def parse_comment(self, i, report=True):
    """Parse the comment starting with '<!--' at index *i* of rawdata.

    Return the index just past the end of the comment, or -1 if the
    comment is not terminated in the data buffered so far.  When
    *report* is true, the comment text is passed to handle_comment().
    """
    rawdata = self.rawdata
    assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
    # An abrupt close ('<!-->' or '<!--->') ends the comment immediately
    # per the "comment start" / "comment start dash" tokenizer states,
    # so it must be tested before searching ahead for a regular close.
    # Searching first would swallow everything up to a later '-->' and
    # would make the result depend on how the input was split across
    # feed() calls.
    match = commentabruptclose.match(rawdata, i+4)
    if not match:
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
    if report:
        # For an abrupt close match.start() == i+4: an empty comment.
        self.handle_comment(rawdata[i+4:match.start()])
    return match.end()
362+
293363 # Internal -- parse bogus comment, return length or -1 if not terminated
294- # see http ://www.w3. org/TR/html5/tokenization .html#bogus-comment-state
364+ # see https ://html.spec.whatwg. org/multipage/parsing .html#bogus-comment-state
295365 def parse_bogus_comment (self , i , report = 1 ):
296366 rawdata = self .rawdata
297367 assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -317,6 +387,8 @@ def parse_pi(self, i):
317387
318388 # Internal -- handle starttag, return end or -1 if not terminated
319389 def parse_starttag (self , i ):
390+ # See the HTML5 specs section "13.2.5.8 Tag name state"
391+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
320392 self .__starttag_text = None
321393 endpos = self .check_for_whole_start_tag (i )
322394 if endpos < 0 :
@@ -356,82 +428,50 @@ def parse_starttag(self, i):
356428 self .handle_starttag (tag , attrs )
357429 if tag in self .CDATA_CONTENT_ELEMENTS :
358430 self .set_cdata_mode (tag )
431+ elif tag in self .RCDATA_CONTENT_ELEMENTS :
432+ self .set_cdata_mode (tag , escapable = True )
359433 return endpos
360434
361435 # Internal -- check to see if we have a complete starttag; return end
362436 # or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Return the index just past the start tag opening at *i*, or -1.

    *i* is the index of the '<'.  The tag is considered complete only
    when the text matched by ``locatetagend`` ends with '>'; otherwise
    -1 is returned.
    """
    text = self.rawdata
    # Skip the '<': the pattern begins at the tag name.
    located = locatetagend.match(text, i+1)
    assert located
    end = located.end()
    return end if text[end-1] == ">" else -1
395445
396446 # Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
    """Handle the end tag opening with '</' at index *i* of rawdata.

    Return the index just past the construct, or -1 if it is not yet
    complete.  '</>' is skipped silently, '</' followed by a non-letter
    is delegated to parse_bogus_comment(), and a real end tag is
    reported through handle_endtag() with its lower-cased name, after
    which raw-text mode is cleared.
    """
    # See the HTML5 specs section "13.2.5.7 End tag open state"
    # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
    text = self.rawdata
    assert text[i:i+2] == "</", "unexpected call to parse_endtag"

    # Fast check: without any '>' ahead nothing can be complete yet.
    if text.find('>', i+2) == -1:
        return -1

    if not endtagopen.match(text, i):  # not '</' + letter
        if text[i+2:i+3] == '>':
            # '</>' is ignored ("missing-end-tag-name" parser error).
            return i + 3
        return self.parse_bogus_comment(i)

    located = locatetagend.match(text, i+2)
    assert located
    end = located.end()
    if text[end-1] != ">":
        return -1

    # Find the name: "13.2.5.8 Tag name state"
    # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
    name = tagfind_tolerant.match(text, i+2)
    assert name
    self.handle_endtag(name.group(1).lower())
    self.clear_cdata_mode()
    return end
435475
436476 # Overridable -- finish processing of start+end tag: <tag.../>
437477 def handle_startendtag (self , tag , attrs ):
0 commit comments