"""A parser for HTML and XHTML.""" # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import markupbase import re # Regular expressions used for parsing interesting_normal = re.compile('[&<]') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') attrfind = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between # </ and the tag name, so maybe this should be fixed endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') class HTMLParseError(Exception): """Exception raised for all parse errors.""" def __init__(self, msg, position=(None, None)): assert msg self.msg = msg self.lineno = position[0] self.offset = position[1] def __str__(self): result = self.msg if self.lineno is not None: result = result + ", at line %d" % self.lineno if self.offset is not None: result = result + ", column %d" % (self.offset + 1) return result class HTMLParser(markupbase.ParserBase): """Find tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). Entity references are passed by calling self.handle_entityref() with the entity reference as the argument. Numeric character references are passed to self.handle_charref() with the string containing the reference as the argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") def __init__(self): """Initialize and reset this instance.""" self.reset() def reset(self): """Reset this instance. Loses all unprocessed data.""" self.rawdata = '' self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None markupbase.ParserBase.reset(self) def feed(self, data): r"""Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). """ self.rawdata = self.rawdata + data self.goahead(0) def close(self): """Handle any buffered data.""" self.goahead(1) def error(self, message): raise HTMLParseError(message, self.getpos()) __starttag_text = None def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: if self.cdata_elem: break j = n if i < j: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith if startswith('<', i): if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif startswith("</", i): k = self.parse_endtag(i) elif startswith("<!--", i): k = self.parse_comment(i) elif startswith("<?", i): k = self.parse_pi(i) elif startswith("<!", i): k = self.parse_html_declaration(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 else: break if k < 0: if not end: break k = rawdata.find('>', i + 1) if k < 0: k = rawdata.find('<', i + 1) if k < 0: k = i + 1 else: k += 1 self.handle_data(rawdata[i:k]) i = self.updatepos(i, k) elif startswith("&#", i): match = charref.match(rawdata, i) if match: name = match.group()[2:-1] self.handle_charref(name) k = match.end() if not startswith(';', k-1): k = k - 1 i = self.updatepos(i, k) continue else: if ";" in rawdata[i:]: #bail by consuming &# self.handle_data(rawdata[0:2]) i = self.updatepos(i, 2) break elif startswith('&', i): match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if not startswith(';', k-1): k = k - 1 i = self.updatepos(i, k) continue match = incomplete.match(rawdata, i) if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: self.error("EOF in middle of entity or char ref") # incomplete break elif (i + 1) < n: # not the end of the buffer, and can't be confused # with some other construct self.handle_data("&") i = self.updatepos(i, i + 1) else: break else: assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem: self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:] # Internal -- parse html declarations, return length or -1 if not terminated # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state # See also parse_declaration in _markupbase def parse_html_declaration(self, i): rawdata = self.rawdata if rawdata[i:i+2] != '<!': self.error('unexpected call to parse_html_declaration()') if rawdata[i:i+4] == '<!--': # this case is actually already handled in goahead() return self.parse_comment(i) elif rawdata[i:i+3] == '<![': return self.parse_marked_section(i) elif rawdata[i:i+9].lower() == '<!doctype': # find the closing > gtpos = rawdata.find('>', i+9) if gtpos == -1: return -1 self.handle_decl(rawdata[i+2:gtpos]) return gtpos+1 else: return self.parse_bogus_comment(i) # Internal -- parse bogus comment, return length or -1 if not terminated # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state def parse_bogus_comment(self, i, report=1): rawdata = self.rawdata if rawdata[i:i+2] not in ('<!', '</'): self.error('unexpected call to parse_comment()') pos = rawdata.find('>', i+2) if pos == -1: return -1 if report: self.handle_comment(rawdata[i+2:pos]) return pos + 1 # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' match = piclose.search(rawdata, i+2) # > if not match: return -1 j = match.start() self.handle_pi(rawdata[i+2: j]) j = match.end() return j # Internal -- handle starttag, return end or -1 if not terminated def parse_starttag(self, i): self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: return endpos rawdata = self.rawdata self.__starttag_text = rawdata[i:endpos] # Now parse the data between i+1 and j into a tag and attrs attrs = [] match = tagfind.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: m = attrfind.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) if not rest: attrvalue = None elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() end = rawdata[k:endpos].strip() if end not in (">", "/>"): lineno, offset = self.getpos() if "\n" in self.__starttag_text: lineno = lineno + self.__starttag_text.count("\n") offset = len(self.__starttag_text) \ - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): # XHTML-style empty tag: <span attr="value" /> self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos # Internal -- check to see if we have a complete starttag; return end # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata m = locatestarttagend.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] if next == ">": return j + 1 if next == "/": if rawdata.startswith("/>", j): return j + 2 if rawdata.startswith("/", j): # buffer boundary return -1 # else bogus input self.updatepos(i, j + 1) self.error("malformed empty start tag") if next == "": # end of input return -1 if next in ("abcdefghijklmnopqrstuvwxyz=/" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 if j > i: return j else: return i + 1 raise AssertionError("we should not get here!") # Internal -- parse endtag, return end or -1 if incomplete def parse_endtag(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" match = endendtag.search(rawdata, i+1) # > if not match: return -1 gtpos = match.end() match = endtagfind.match(rawdata, i) # </ + tag + > if not match: if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: # w3.org/TR/html5/tokenization.html#end-tag-open-state if rawdata[i:i+3] == '</>': return i+3 else: return self.parse_bogus_comment(i) tagname = namematch.group().lower() # consume and ignore other stuff between the name and the > # Note: this is not 100% correct, since we might have things like # </tag attr=">">, but looking for > after tha name should cover # most of the cases and is much simpler gtpos = rawdata.find('>', namematch.end()) self.handle_endtag(tagname) return gtpos+1 elem = match.group(1).lower() # script or style if self.cdata_elem is not None: if elem != self.cdata_elem: self.handle_data(rawdata[i:gtpos]) return gtpos self.handle_endtag(elem) self.clear_cdata_mode() return gtpos # Overridable -- finish processing of start+end tag: <tag.../> def handle_startendtag(self, tag, attrs): self.handle_starttag(tag, attrs) self.handle_endtag(tag) # Overridable -- handle start tag def handle_starttag(self, tag, attrs): pass # Overridable -- handle end tag def handle_endtag(self, tag): pass # Overridable -- handle character reference def handle_charref(self, name): pass # Overridable -- handle entity reference def handle_entityref(self, name): pass # Overridable -- handle data def handle_data(self, data): pass # Overridable -- handle comment def handle_comment(self, data): pass # Overridable -- handle declaration def handle_decl(self, decl): pass # Overridable -- handle processing instruction def handle_pi(self, data): pass def unknown_decl(self, data): pass # Internal -- helper to remove special character quoting entitydefs = None def unescape(self, s): if '&' not in s: return s def replaceEntities(s): s = s.groups()[0] try: if s[0] == "#": s = s[1:] if s[0] in ['x','X']: c = int(s[1:], 16) else: c = int(s) return unichr(c) except ValueError: return '&#'+s+';' else: # Cannot use name2codepoint directly, because HTMLParser supports apos, # which is not part of HTML 4 import htmlentitydefs if HTMLParser.entitydefs is None: entitydefs = HTMLParser.entitydefs = {'apos':u"'"} for k, v in htmlentitydefs.name2codepoint.iteritems(): entitydefs[k] = unichr(v) try: return self.entitydefs[s] except KeyError: return '&'+s+';' return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)

blog

Онлайн казино с моментальным выводом и привлекательными бонусами

June 20, 2025 blog 0

Онлайн казино с моментальным выводом и привлекательными бонусами Виртуальные гэмблинг-платформы с мгновенным выплатой средств становятся все более привлекательными среди пользователей, стремящихся к максимальному удобству и оперативности. В подобных онлайн-казино необходимо не только наличие обширного выбора развлечений, но и скорость обработки транзакций. Посредством новейших разработок, многочисленные сервисы, например, как вавада, предлагают …

Официальный Сайт Играть в Онлайн Казино Pinco.1661 (3)

June 19, 2025 blog 0

Пинко Казино Официальный Сайт – Играть в Онлайн Казино Pinco ▶️ ИГРАТЬ Содержимое Pinco Casino: Официальный Сайт Возможности Онлайн Казино Преимущества игроков в Pinco Казино Большой выбор игр Лучшие условия для игроков 24/7 поддержка Как Играть в Pinco Casino В мире онлайн-казино есть много вариантов для игроков, но не все …

1win — скачать приложение букмекерской конторы.4289

June 19, 2025 blog 0

1win — скачать приложение букмекерской конторы ▶️ ИГРАТЬ Содержимое Установка приложения 1win Функциональность приложения 1win Главные функции Преимущества использования приложения 1win Как скачать приложение 1win Обзор безопасности приложения 1win В мире ставок и азарта 1вин является одним из самых популярных букмекерских контор, которые предлагают своим клиентам широкий спектр услуг и …

Cresus casino en ligne Inscription et connexion.2051

June 19, 2025 blog 0

Cresus casino en ligne – Inscription et connexion ▶️ JOUER Содержимое Cresus Casino en Ligne : Inscription et Connexion Inscription au Cresus Casino Connexion au Cresus Casino Inscription au Cresus Casino en Ligne Création de votre compte Cresus Casino Connexion au Cresus Casino en Ligne Connexion avec votre compte Cresus …

Официальный Сайт Вход на Рабочее Зеркало Vavada.2091

June 18, 2025 blog 0

Вавада Казино Официальный Сайт – Вход на Рабочее Зеркало Vavada ▶️ ИГРАТЬ Содержимое Vavada Casino Official Website: Access to the Working Mirror Vavada Вавада зеркало: что это и почему его нужно? Вавада вход: как получить доступ к играм? Vavada Casino Official Website: Access to the Working Mirror Vavada What is …

Gates of Olympus Slot Türkiye.3298

June 18, 2025 blog 0

Gates of Olympus Slot Türkiye ▶️ OYNAMAK Содержимое Gates of Olympus Slot Nasıl Oynanır Gates of Olympus Slot Özellikleri ve Sembolleri Gates of Olympus Slot Kazanç Oranları ve Ödülleri Ödüller ve Kazanç Oranları Kazanç Oranları ve Ödüllerin Ayrıntıları gates of olympus oyna, Yunan mitolojisine dayanan bir slot oyunudur. Gates of …

Casibom Casino – Güvenilir Online Casino Giriş Adresi.4680

June 18, 2025 blog 0

Casibom Casino – Güvenilir Online Casino Giriş Adresi ▶️ OYNAMAK Содержимое Casibom Casino Hakkında Genel Bilgiler Güvenlik ve Güvenilirlik Casibom Casino’da Güvenliği Nasıl Garantiedir? Şifreleme ve Güvenlik Protokolleri Casibom Casino’da Oynayabileceğiniz Oyunlar casibom , en güvenilir online casino sitelerinden biridir. Güvenli ve hızlı bir giriş deneyimi sunar. Casibom giriş sayfasından …

Официальный Сайт Играть в Онлайн Казино Pinco.3744 (2)

June 17, 2025 blog 0

Пинко Казино Официальный Сайт – Играть в Онлайн Казино Pinco ▶️ ИГРАТЬ Содержимое Удобство и Безопасность в Казино Pinco Возможности и Бонусы Бонусы для новых игроков Преимущества игроков В современном мире интернета и технологий, казино стали одним из самых популярных способов развлечения и заработка. В этом тексте мы будем говорить …

Casinos en ligne légitimes pour les joueurs français.388

June 16, 2025 blog 0

Casinos en ligne légitimes pour les joueurs français ▶️ JOUER Содержимое Les avantages de jouer dans un casino en ligne légitime Comment choisir un casino en ligne fiable et sérésé Les règles et les lois régissant les casinos en ligne en France Conseils pour jouer de manière responsable dans un …

Alexander Casino Avis 2024 Bonus Gratuit 100€ + 100 FS.1209 (2)

June 16, 2025 blog 0

Содержимое Le Casino Alexandre : Un Avis 2024 sur les Bonus Gratuits de 100€ + 100 FS Alexander Casino Avis 2024 – Bonus Gratuit 100€ + 100 FS Présentation de l’entreprise Le bonus gratuit de 100€ + 100 FS Les jeux de casino proposés Conseils et astuces pour jouer à …

Page 11 of 16« First...«9 101112 13 » ...Last »