crawler.py
import re
import time
import asyncio
import datetime
from typing import List

import aiohttp

from utils import BASE_URL
from dal import is_duplicate, create_record_for_movies

# Careful! Only run this module when you want to crawl movies.


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    # Send a browser-like User-Agent so the site does not block the crawler.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    async with session.get(url, headers=headers) as response:
        return await response.text()
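
# A minimal usage sketch for fetch(); the page number is hypothetical and this
# assumes BASE_URL is a plain http(s) URL:
#
#     async with aiohttp.ClientSession() as session:
#         html = await fetch(session, f'{BASE_URL}/page/1')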


async def movie_crawler(session: aiohttp.ClientSession, start_page: int, end_page: int) -> List[str]:
    """
    Take start_page and end_page, send a GET request to the base url for each
    page in that range, then extract all movie links with a regex.
    Args:
        start_page: int
        end_page: int
    Returns:
        list(str)
    """
    tasks = []
    for page in range(start_page, end_page + 1):
        task = asyncio.create_task(fetch(session, f'{BASE_URL}/page/{page}'))
        tasks.append(task)
        # Throttle task creation so the target site is not flooded with requests.
        await asyncio.sleep(2)
    pages_content = await asyncio.gather(*tasks)
    all_links = []
    for content in pages_content:
        # Raw pattern with re.escape() so special characters in BASE_URL are
        # matched literally rather than interpreted as regex syntax.
        links = re.findall(rf'{re.escape(BASE_URL)}/\d+/[\w-]+/', content)
        all_links.extend(set(links))
    return all_links
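
# The pattern above expects links shaped like f'{BASE_URL}/12345/some-movie-2021/'
# (a numeric id segment followed by a hyphenated slug); the exact URL scheme is an
# assumption about the target site.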


async def series_crawler(session: aiohttp.ClientSession, start_page: int, end_page: int) -> List[str]:
    """
    Take start_page and end_page, send a GET request to the series category url
    for each page in that range, then extract all series links with a regex.
    Args:
        start_page: int
        end_page: int
    Returns:
        list(str)
    """
    tasks = []
    for page in range(start_page, end_page + 1):
        # 'دانلود-سریال' is the site's series category slug (Persian for 'series download').
        task = asyncio.create_task(fetch(session, f'{BASE_URL}/category/دانلود-سریال/page/{page}'))
        tasks.append(task)
        # Throttle task creation so the target site is not flooded with requests.
        await asyncio.sleep(2)
    pages_content = await asyncio.gather(*tasks)
    all_links = []
    for content in pages_content:
        links = re.findall(rf'{re.escape(BASE_URL)}/\d+/[\w-]+/', content)
        all_links.extend(set(links))
    return all_links


def remove_duplicate(movie_list: List[str]) -> List[str]:
    """
    Take a list of movie/series links and return it with duplicates removed,
    by converting the list to a set and back.
    Args:
        movie_list: list
    Returns:
        list(str)
    """
    return list(set(movie_list))
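
# For example (result order is not guaranteed, since sets are unordered):
#
#     remove_duplicate(['a', 'b', 'a'])  # -> ['a', 'b'] in some order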


def ready_for_insert(movies: list) -> tuple:
    """
    Take a list of movie urls and extract movie_name and published_date from each url.
    Call is_duplicate() for each movie to check whether it already exists in the db,
    then check whether the url ends in a year and call create_record_for_movies()
    accordingly. Return the duplicate count and the number of newly crawled movies.
    Args:
        movies: list
    Returns:
        tuple(duplicate_counter: int, crawled_counter: int)
    """
    duplicate_counter = 0
    movies_data = list()
    movies_with_published_date = list()
    for url in movies:
        # The slug is the second-to-last path segment, e.g. '.../some-movie-2021/'.
        movie_name = url.split('/')[-2]
        if is_duplicate(movie_name):
            duplicate_counter += 1
            continue
        # Treat the last '-'-separated token as the release year if it is a
        # 4-digit number.
        published_date = movie_name.split('-')[-1]
        if published_date.isnumeric() and len(published_date) == 4:
            movies_with_published_date.append((url, movie_name, published_date))
        else:
            movies_data.append((url, movie_name))
    crawled_counter = len(movies_data) + len(movies_with_published_date)
    create_record_for_movies(movies_data)
    create_record_for_movies(movies_with_published_date, has_published_date=True)
    return duplicate_counter, crawled_counter
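
# A worked example with a hypothetical link:
#
#     url            -> f'{BASE_URL}/12345/some-movie-2021/'
#     movie_name     -> 'some-movie-2021'   # second-to-last path segment
#     published_date -> '2021'              # numeric and 4 digits, so stored with a date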


async def main():
    async with aiohttp.ClientSession() as session:
        start_time = time.time()
        start_page = 1
        end_page = 3
        crawled_movies = await movie_crawler(session, start_page, end_page)
        crawled_series = await series_crawler(session, start_page, end_page)
        movies = remove_duplicate(crawled_movies)
        series = remove_duplicate(crawled_series)
        insert_movies = ready_for_insert(movies)
        insert_series = ready_for_insert(series)
        end_time = time.time()
        consumed_time = end_time - start_time
        with open('crawl.log', 'a') as f:
            f.write(
                f'Crawled Successfully on {datetime.datetime.now()}\n'
                f'{insert_movies[1] + insert_series[1]} New Movies and Series\n'
                f'{insert_movies[0]} duplicate movies found.\n'
                f'{insert_series[0]} duplicate series found.\n'
                f'Consumed Time -> {consumed_time:.2f} seconds!\n\n'
            )


if __name__ == '__main__':
    asyncio.run(main())