Current File : //lib64/python3.6/urllib/robotparser.py
""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                            and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('','',parsed_url.path,
            parsed_url.params,parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        return self.default_entry.delay

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        return self.default_entry.req_rate

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        ret.append('')  # for compatibility
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
blog

blog

Sweet Bonanza Slot by Pragmatic Play Features and Symbols.421

Sweet Bonanza Slot by Pragmatic Play – Features and Symbols ▶️ PLAY Содержимое Unlocking the Secrets of the Game The Power of the Wilds Exploring the Symbols and Their Meanings Get ready to indulge in a world of sweet treats and big wins with the Sweet Bonanza slot by Pragmatic …

Read More »

Sahabet – Sahabet Casino – Sahabet Giriş.8257

Sahabet – Sahabet Casino – Sahabet Giriş ▶️ OYNAMAK Содержимое Sahabet Giriş ve Sahabet Girişi Sahabet Girişi Güncel Yöntemler Sahabet Bahis ve Sahadanbet Sahabet Casino Hakkında Temel Bilgiler Sahabet Casino Oyunları Sahabet ve Sahabet Casino ile ilgili güncel bilgileri ve giriş yollarını anlatacağım. Sahabet, güvenli ve profesyonel bir platform olarak …

Read More »

Betting sites UK How to Make the Most of Your Bets.101

Betting sites UK – How to Make the Most of Your Bets ▶️ PLAY Содержимое Choosing the Right Bookmaker for Your Needs Top 20 Betting Sites UK: A Comprehensive Guide In the world of sports betting, the UK is a hub of activity, with numerous new betting sites emerging to …

Read More »

Casino en ligne Quatro Collection de jeux.1192

Casino en ligne Quatro – Collection de jeux ▶️ JOUER Содержимое Casino en ligne Quatro: Collection de jeux Casino en ligne Quatro: Collection de jeux Table Games Connexion and Mobile Wide Range of Games to Choose From Quatro Casino Login and Rewards High-Quality Game Providers Constantly Updated and Expanded Are …

Read More »

Казино Официальный сайт Pin Up Casino играть онлайн – Вход, Зеркало.7655

Пин Ап Казино Официальный сайт | Pin Up Casino играть онлайн – Вход, Зеркало ▶️ ИГРАТЬ Содержимое Pin Up Casino – Официальный Сайт Играть Онлайн – Вход, Зеркало Как играть на Пин Ап Казино Зеркало Пин Ап Казино В современном мире азартных игр, где каждый день появляются новые онлайн-казино, найти …

Read More »

BasariBet Casino Giriş – Canlı Casino Oyunları.1997

BasariBet Casino Giriş – Canlı Casino Oyunları ▶️ OYNAMAK Содержимое BasariBet Casino’de Canlı Casino Oyunları Nasıl Oynanır? BasariBet Casino’de Canlı Casino Oyunları için En İyi Seçenekler BasariBet Casino’de Canlı Casino Oyunları: Güvenlik ve Destek Hizmetleri BasariBet Casino, oyun sevdiklerinin en güvenilir ve eğlenceli seçeneklerinden biridir. Bu platform, kullanıcılarına canlı casino …

Read More »

1win официальный сайт букмекера — Обзор и зеркало для входа.1580

1win официальный сайт букмекера — Обзор и зеркало для входа ▶️ ИГРАТЬ Содержимое 1win Официальный Сайт Букмекера Обзор и Зеркало для Входа Преимущества и Функции 1win В мире ставок и азарта 1win является одним из самых популярных букмекеров, предлагающих широкий спектр услуг для игроков. Компания была основана в 2018 году …

Read More »

сайт – онлайн казино и покер рум – Вход.4789

Покердом официальный сайт – онлайн казино и покер рум – Вход ▶️ ИГРАТЬ Содержимое Покердом – официальный сайт онлайн казино и покер рума Преимущества Покердома Преимущества игры на официальном сайте PokerDom Как зарегистрироваться и начать играть на Pokerdom Возможности онлайн казино и покер рума Большой выбор игр Бонусы и акции …

Read More »

1Win Azerbaijan – dman Mrclri v Casino sayt.641

1Win Azerbaijan – İdman Mərcləri və Casino saytı ▶️ OYNA Содержимое 1Win Azerbaijan haqqında məlumatlar Idman mərcələrindən istifadə edən məsləhətlər Casino saytı haqqında məlumatlar Idman mərcələrindən və casino saytı ilə bağlı məlumatlar 1Win Azerbaijan – bu idman mərcələr və casino xidmətlərindən istifadə etmək üçün ən yaxşı veb sayt. Bu saytda …

Read More »

– Официальный сайт Pinco Casino.1586

Пинко Казино – Официальный сайт Pinco Casino ▶️ ИГРАТЬ Содержимое Преимущества игры на официальном сайте Pinco Casino Виды игр и слотов на официальном сайте Pinco Casino Бонусы и акции на официальном сайте Pinco Casino В наше время казино стали популярными развлечениями для многих людей. Многие из них ищут новые и …

Read More »