Commit
read remote file fix for #2
seowings committed Mar 17, 2023
1 parent 80fec52 commit 7da3eb2
Showing 5 changed files with 104 additions and 17 deletions.
9 changes: 8 additions & 1 deletion examples/complete_example.py
@@ -34,4 +34,11 @@
robots_file.add_user_agent(ua_general)
robots_file.add_user_agent(ua_general_google)

robots_file.write("robots.txt")
robots_file.write("robots.txt")

# Read Remote File
robots_file_2 = RobotsTxt()
robots_file_2.read("https://nike.com/robots.txt")
robots_file_2.write("nike_robots.txt")

print(robots_file_2.robots_details("Baiduspider"))
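
One caveat worth flagging next to this example: if the remote fetch fails, get_remote_content() (added below in pyrobotstxt/__init__.py) returns a mocked response with sentinel status code 9999, so read() adds no user agents. A defensive caller, continuing the example above, might look like this sketch — the user_agents attribute name is an assumption inferred from add_user_agent(), not confirmed by the diff:

robots_file_3 = RobotsTxt()
robots_file_3.read("https://example.invalid/robots.txt")  # unreachable host

# "user_agents" is assumed here; read() only populates agents via add_user_agent()
if not getattr(robots_file_3, "user_agents", []):
    print("nothing parsed - the remote fetch may have failed")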
10 changes: 3 additions & 7 deletions examples/robots.txt
@@ -1,27 +1,23 @@
# Welcome Crawlers
# Created on 2022-12-04 14:21:53.368205 using pyrobotstxt

User-agent: *

# Created on 2023-03-17 23:42:50.589282 using pyrobotstxt
User-agent: *
# Allowed Patterns
Allow: /home
Allow: /deep

# Disallowed Patterns
Disallow: /topi?a
Disallow: /img*.png$
Disallow: /nopi$
Disallow: /img*.png$

User-agent: Google

# Allowed Patterns
Allow: /home
Allow: /deep

# Disallowed Patterns
Disallow: /topi?a
Disallow: /img*.png$
Disallow: /nopi$
Disallow: /img*.png$

# Site Maps
Sitemap: https://seowings.org/sitemap.xml
2 changes: 1 addition & 1 deletion mkdocs.yml
@@ -31,4 +31,4 @@ markdown_extensions:

theme: readthedocs

copyright: © Copyright 2022 Faisal Shahzad (seowings.org)
copyright: © Copyright 2022-2023 Faisal Shahzad (seowings.org)
98 changes: 91 additions & 7 deletions pyrobotstxt/__init__.py
@@ -24,14 +24,24 @@
"""

# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# IMPORTS
# IMPORTS Standard Library
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

import re
from unittest.mock import Mock
import os
import json
from math import ceil
from datetime import datetime

# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# IMPORTS 3rd Party Libraries
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

import requests
from requests.adapters import HTTPAdapter
from requests.models import Response
from bs4 import BeautifulSoup
from PIL import Image

# +++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -62,6 +72,45 @@
"xenu": "xenu",
}

HEADER = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# UTIL FUNCTIONS
# +++++++++++++++++++++++++++++++++++++++++++++++++++++


def mock_requests_object(url):
    """Return a mocked Response carrying a sentinel status code for failed fetches."""
    response = Mock(spec=Response)
    response.text = ""
    response.status_code = 9999
    response.url = url
    return response


def get_remote_content(url, max_retries=5):
    """Fetch url with retries; fall back to a mocked response if the request fails."""
    try:
        s = requests.Session()
        s.mount(url, HTTPAdapter(max_retries=max_retries))
        return s.get(url, headers=HEADER)
    except requests.exceptions.RequestException:
        return mock_requests_object(url)


def get_corrected_url(url, fix_slash="sitemap.xml"):
    """Prefix a missing scheme and append the expected path suffix to url."""
    if not url.startswith("http://") and not url.startswith("https://"):
        url = f"http://{url}"

    if not url.endswith(fix_slash):
        url = f"{url}/{fix_slash}"

    return url
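
# For orientation (an illustrative note, not part of the commit), the helper
# normalises URLs like this -- derived purely from the logic above:
#
#   get_corrected_url("seowings.org")             -> "http://seowings.org/sitemap.xml"
#   get_corrected_url("nike.com/robots.txt", "")  -> "http://nike.com/robots.txt"
#
# An empty fix_slash never appends anything, because every string ends with "".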


# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# CLASSES
# +++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -213,7 +262,7 @@ def disallow_pagination(self, prefix="/page/*", comments=""):
def consolidate(self):
"""consolidate all the information (allowed, disallowed, sitemaps) in single text string."""

self.content = f"\n\nUser-agent: {self.user_agent_name}"
self.content = f"User-agent: {self.user_agent_name}"

# Support for including Crawl_delay. see feature request #1
if self.crawl_delay > 0:
@@ -231,6 +280,8 @@ def consolidate(self):
self.content += "\n\n# Site Maps\n"
self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])

self.content += "\n\n"
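
# Net effect of these two consolidate() changes (an illustrative note, not
# part of the commit): the blank-line separator moves from the front of each
# block to its end, so the first User-agent line sits directly under the header:
#
#   before: content = "\n\nUser-agent: <name>...rules"    (leading separator)
#   after:  content = "User-agent: <name>...rules\n\n"    (trailing separator)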


class RobotsTxt:
def __init__(self, version=""):
@@ -246,10 +297,43 @@ def __init__(self, version=""):
self.header = "" # message added to the start of the output file.
self.footer = "" # message added to the end of the output file.

def read(self):
"""read a robots.txt File (TODO)"""
def read(self, robots_url):
    """read a robots.txt file from a remote URL"""

    self.create_time = datetime.now()
    robots_url = get_corrected_url(robots_url, "")
    response = get_remote_content(robots_url)

    if response.status_code < 400:
        for ua_item in response.text.split("User-agent:"):
            if ua_item:
                ua_content_items = [
                    ua_split_item.strip()
                    for ua_split_item in ua_item.split("\n")
                    if ua_split_item
                ]
                # skip chunks that hold only the file's leading comments
                if ua_content_items and not ua_content_items[0].startswith("#"):
                    ua = UserAgent(ua_name=ua_content_items[0])
                    ua.add_allow(
                        [
                            it.split("Allow:")[-1].strip()
                            for it in ua_content_items[1:]
                            if it.startswith("Allow:")
                        ]
                    )
                    ua.add_disallow(
                        [
                            it.split("Disallow:")[-1].strip()
                            for it in ua_content_items[1:]
                            if it.startswith("Disallow:")
                        ]
                    )
                    # TODO: comments are not included yet
                    comment = [
                        it.split("# ")[-1]
                        for it in ua_content_items[1:]
                        if it.startswith("#")
                    ]

                    self.add_user_agent(ua=ua)
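
# An illustrative note (not part of the commit) on how read() tokenises a
# fetched file: splitting on "User-agent:" leaves any leading file comments
# in the first chunk and starts every later chunk with the agent name, e.g.
#
#   "# hi\nUser-agent: *\nAllow: /home\n\nUser-agent: Google\nDisallow: /img*.png$"
#
# .split("User-agent:") yields:
#
#   ["# hi\n", " *\nAllow: /home\n\n", " Google\nDisallow: /img*.png$"]
#
# The first chunk starts with "#" and is skipped; the rest become UserAgent objects.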

def write(self, file_path="robots.txt"):
"""write robots.txt file at a given file_path location.
@@ -268,15 +352,15 @@ def write(self, file_path="robots.txt"):
ua.consolidate()
f.write(ua.content)

f.write("\n\n")
f.write("\n")

# append ascii image, if available
if self.image_branding:
f.write(self.image_branding)

# append footer message
if self.footer:
f.write(f"\n\n# {self.footer}")
f.write(f"\n# {self.footer}")

def include_header(self, message="", append_date=True):
"""include header message with/without creation date.
Expand All @@ -286,7 +370,7 @@ def include_header(self, message="", append_date=True):
append_date (bool, optional): Append date/time to the header. Defaults to True.
"""

self.header = f"{message}"
self.header = message

if append_date:
self.header += f"\n# Created on {self.create_time} using pyrobotstxt"
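
A short usage sketch (behaviour inferred only from the method shown above; create_time is set elsewhere in the class):

rt = RobotsTxt()
rt.include_header("Welcome Crawlers")
# rt.header now reads:
# "Welcome Crawlers\n# Created on <rt.create_time> using pyrobotstxt"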
2 changes: 1 addition & 1 deletion setup.py
@@ -62,7 +62,7 @@
],
packages=["pyrobotstxt"],
python_requires=">=3.9",
install_requires=["pillow==9.3.0"],
install_requires=["pillow==9.3.0", "requests==2.28.2", "beautifulsoup4==4.11.2"],
extras_require={
"dev": [
"setuptools",
