diff --git a/examples/complete_example.py b/examples/complete_example.py
index ddc9b96..4de544c 100644
--- a/examples/complete_example.py
+++ b/examples/complete_example.py
@@ -34,4 +34,11 @@
 robots_file.add_user_agent(ua_general)
 robots_file.add_user_agent(ua_general_google)

-robots_file.write("robots.txt")
\ No newline at end of file
+robots_file.write("robots.txt")
+
+# Read Remote File
+robots_file_2 = RobotsTxt()
+robots_file_2.read("https://nike.com/robots.txt")
+robots_file_2.write("nike_robots.txt")
+
+print (robots_file_2.robots_details("Baiduspider"))
diff --git a/examples/robots.txt b/examples/robots.txt
index d9330a5..6eea95c 100644
--- a/examples/robots.txt
+++ b/examples/robots.txt
@@ -1,27 +1,23 @@
 # Welcome Crawlers
-# Created on 2022-12-04 14:21:53.368205 using pyrobotstxt
-
-User-agent: *
-
+# Created on 2023-03-17 23:42:50.589282 using pyrobotstxtUser-agent: *
 # Allowed Patterns
 Allow: /home
 Allow: /deep

 # Disallowed Patterns
 Disallow: /topi?a
-Disallow: /img*.png$
 Disallow: /nopi$
+Disallow: /img*.png$

 User-agent: Google
-
 # Allowed Patterns
 Allow: /home
 Allow: /deep

 # Disallowed Patterns
 Disallow: /topi?a
-Disallow: /img*.png$
 Disallow: /nopi$
+Disallow: /img*.png$

 # Site Maps
 Sitemap: https://seowings.org/sitemap.xml
diff --git a/mkdocs.yml b/mkdocs.yml
index f76f647..f30061c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -31,4 +31,4 @@ markdown_extensions:

 theme: readthedocs

-copyright: © Copyright 2022 Faisal Shahzad (seowings.org)
\ No newline at end of file
+copyright: © Copyright 2022-2023 Faisal Shahzad (seowings.org)
\ No newline at end of file
diff --git a/pyrobotstxt/__init__.py b/pyrobotstxt/__init__.py
index 5d588fb..2bc5c4c 100644
--- a/pyrobotstxt/__init__.py
+++ b/pyrobotstxt/__init__.py
@@ -24,14 +24,24 @@
 """

 # +++++++++++++++++++++++++++++++++++++++++++++++++++++
-# IMPORTS
+# IMPORTS Standard Library
 # +++++++++++++++++++++++++++++++++++++++++++++++++++++

+import re
+from unittest.mock import Mock
 import os
 import json
 from math import ceil
 from datetime import datetime

+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+# IMPORTS 3rd Party Libraries
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.models import Response
+from bs4 import BeautifulSoup
 from PIL import Image

 # +++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -62,6 +72,45 @@
     "xenu": "xenu",
 }

+HEADER = {
+    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
+}
+
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+# UTIL FUNCTIONS
+# +++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+
+def mock_requests_object(url):
+    """ """
+    response = Mock(spec=Response)
+    response.text = ""
+    response.status_code = 9999
+    response.url = url
+    return response
+
+
+def get_remote_content(url, max_retires=5):
+    """ """
+    try:
+        s = requests.Session()
+        s.mount(url, HTTPAdapter(max_retries=max_retires))
+        return s.get(url, headers=HEADER)
+    except:
+        return mock_requests_object(url)
+
+
+def get_corrected_url(url, fix_slash="sitemap.xml"):
+    """ """
+    if not url.startswith("http://") and not url.startswith("https://"):
+        url = f"http://{url}"
+
+    if not url.endswith(fix_slash):
+        url = f"{url}/{fix_slash}"
+
+    return url
+
+
 # +++++++++++++++++++++++++++++++++++++++++++++++++++++
 # CLASSES
 # +++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -213,7 +262,7 @@ def disallow_pagination(self, prefix="/page/*", comments=""):

     def consolidate(self):
         """consolidate all the information (allowed, disallowed, sitemaps) in single text string."""
-        self.content = f"\n\nUser-agent: {self.user_agent_name}"
+        self.content = f"User-agent: {self.user_agent_name}"

         # Support for including Crawl_delay. see feature request #1
         if self.crawl_delay > 0:
@@ -231,6 +280,8 @@ def consolidate(self):
             self.content += "\n\n# Site Maps\n"
             self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])

+        self.content += "\n\n"
+

 class RobotsTxt:
     def __init__(self, version=""):
@@ -246,10 +297,43 @@ def __init__(self, version=""):
         self.header = ""  # message added to the start of the output file.
         self.footer = ""  # message added to the end of the output file.

-    def read(self):
-        """read a robots.txt File (TODO)"""
+    def read(self, robots_url):
+        """read a robots.txt File"""
         self.create_time = datetime.now()

+        robots_url = get_corrected_url(robots_url, "")
+        response = get_remote_content(robots_url)
+
+        if response.status_code < 400:
+            for ua_item in response.text.split("User-agent:"):
+                if ua_item:
+                    ua_content_items = [
+                        ua_split_item.strip() for ua_split_item in ua_item.split("\n") if ua_split_item
+                    ]
+                    if not ua_content_items[0].startswith("#"):
+                        ua = UserAgent(ua_name=ua_content_items[0])
+                        ua.add_allow(
+                            [
+                                it.split("Allow:")[-1]
+                                for it in ua_content_items[1:]
+                                if it.startswith("Allow:")
+                            ]
+                        )
+                        ua.add_disallow(
+                            [
+                                it.split("Disallow:")[-1]
+                                for it in ua_content_items[1:]
+                                if it.startswith("Disallow:")
+                            ]
+                        )
+                        # TODO: Comments are not included Yet
+                        comment = [
+                            it.split("# ")[-1]
+                            for it in ua_content_items[1:]
+                            if it.startswith("#")
+                        ]
+
+                        self.add_user_agent(ua=ua)

     def write(self, file_path="robots.txt"):
         """write robots.txt file at a given file_path location.
@@ -268,7 +352,7 @@ def write(self, file_path="robots.txt"):

             ua.consolidate()
             f.write(ua.content)
-            f.write("\n\n")
+            f.write("\n")

         # append ascii image, if available
         if self.image_branding:
@@ -276,7 +360,7 @@ def write(self, file_path="robots.txt"):

         # append footer message
         if self.footer:
-            f.write(f"\n\n# {self.footer}")
+            f.write(f"\n# {self.footer}")

     def include_header(self, message="", append_date=True):
         """include header message with/without creation date.
@@ -286,7 +370,7 @@ def include_header(self, message="", append_date=True):
             append_date (bool, optional): Append date/time to the header. Defaults to True.
         """

-        self.header = f"{message}"
+        self.header = message

         if append_date:
             self.header += f"\n# Created on {self.create_time} using pyrobotstxt"
diff --git a/setup.py b/setup.py
index b7dd6fb..451834e 100644
--- a/setup.py
+++ b/setup.py
@@ -62,7 +62,7 @@
     ],
     packages=["pyrobotstxt"],
     python_requires=">=3.9",
-    install_requires=["pillow==9.3.0"],
+    install_requires=["pillow==9.3.0", "requests==2.28.2", "beautifulsoup4==4.11.2"],
     extras_require={
         "dev": [
             "setuptools",