This repository has been archived by the owner on Aug 4, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
search_keywords.py
147 lines (113 loc) · 4.54 KB
/
search_keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import argparse
import sys
import configparser
import feedparser
import datetime
import delorean
import requests
from bs4 import BeautifulSoup
import mistune
import jinja2
from collections import namedtuple
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
# Bundle the email settings (login and addresses) in a single record.
# 'from' is a reserved Python keyword, so the sender field is 'from_'.
EmailConfig = namedtuple('EmailConfig', 'user password from_ to')
# Get the email templates from hard disk
# Both paths are relative, so they are resolved against the current
# working directory when this module is loaded
EMAIL_TEMPLATE_FILE = 'email_template.md'
EMAIL_STYLING_FILE = 'email_styling.html'

# Decode explicitly as UTF-8 so that non-ASCII text in the templates is
# read the same way whatever the platform's default encoding is
with open(EMAIL_TEMPLATE_FILE, encoding='utf-8') as md_file:
    EMAIL_TEMPLATE = md_file.read()

with open(EMAIL_STYLING_FILE, encoding='utf-8') as html_file:
    EMAIL_STYLING = html_file.read()
def get_articles(keywords, feeds, timeout=30):
    '''
    Retrieve a list of articles from the feeds that contain the keywords

    Only entries published in the 7 days before the feed's 'updated'
    time (or before the current UTC time, when the parsed feed has no
    'updated' value) are checked. Each linked page is downloaded and its
    text is searched, case-insensitively, for any of the keywords.

    keywords: strings to look for in the article text
    feeds: feed URLs, parsed with feedparser
    timeout: seconds to wait for each article download; without it a
             stalled server would block the whole run forever

    Each article is returned in the format:
    (title, summary, link)
    '''
    # Lowercase the keywords once, not once per article
    lowered_keywords = [keyword.lower() for keyword in keywords]

    articles = []
    for feed in feeds:
        rss = feedparser.parse(feed)
        updated_time = rss.get('updated', str(datetime.datetime.utcnow()))
        time_limit = delorean.parse(updated_time) - datetime.timedelta(days=7)

        for entry in rss.entries:
            # Normalise the time
            entry_time = delorean.parse(entry.published)
            entry_time.shift('UTC')
            if entry_time < time_limit:
                # Skip this entry
                continue

            # Get the article
            response = requests.get(entry.link, timeout=timeout)
            article = BeautifulSoup(response.text, 'html.parser')

            # A page may have no <title>, or a <title> that is not a
            # single string; in both cases fall back to the title given
            # by the feed entry, and finally to the link itself
            title_tag = article.title
            page_title = title_tag.string if title_tag is not None else None
            title = page_title or entry.get('title') or entry.link

            # Lowercase the page text once for all the keywords
            article_text = article.get_text().lower()
            if any(keyword in article_text for keyword in lowered_keywords):
                articles.append((title.strip(),
                                 entry.summary.strip(),
                                 entry.link))

    return articles
def compose_email_body(articles, keywords, feed_list):
    '''
    Build the plain text and the HTML bodies of the report email

    articles: iterable of (title, summary, link) tuples
    keywords: strings, shown in the email joined by ', '
    feed_list: strings, shown in the email joined by ', '

    Returns a (text, html) tuple: the filled-in EMAIL_TEMPLATE, and that
    same text converted from Markdown and rendered inside EMAIL_STYLING
    '''
    # One Markdown bullet per article
    bullets = []
    for title, summary, link in articles:
        bullets.append('* **{title}** {summary}: {link}'.format(
            title=title, summary=summary, link=link))

    # Fill the placeholders of the Markdown template
    text = EMAIL_TEMPLATE.format(
        article_list='\n'.join(bullets),
        keywords=', '.join(keywords),
        feed_list=', '.join(feed_list),
    )

    # Markdown -> HTML fragment, then wrap it with the styling template
    fragment = mistune.markdown(text)
    styling = jinja2.Template(EMAIL_STYLING)
    html = styling.render(content=fragment)

    return text, html
def send_email(email_config, text_body, html_body):
    '''
    Deliver the report through the SMTP server at smtp.gmail.com:587

    email_config: EmailConfig with the login (user, password) and the
                  addresses (from_, to)
    text_body: plain text version of the message
    html_body: HTML version of the message

    Both versions travel in a single multipart/alternative message
    '''
    sender = email_config.from_
    recipient = email_config.to

    message = MIMEMultipart('alternative')
    message['Subject'] = 'Weekly report'
    message['From'] = sender
    message['To'] = recipient

    # Attach the plain text part first and the HTML part last
    for body, subtype in ((text_body, 'plain'), (html_body, 'html')):
        message.attach(MIMEText(body, subtype))

    # Upgrade the connection to TLS before sending the credentials
    with smtplib.SMTP('smtp.gmail.com', 587) as smtp:
        smtp.starttls()
        smtp.login(email_config.user, email_config.password)
        smtp.sendmail(sender, [recipient], message.as_string())
def main(keywords, feeds, email_config):
    '''
    Run the whole report: search the feeds, compose the email, send it

    keywords: strings to search for in the articles
    feeds: feed URLs to read
    email_config: EmailConfig used to send the resulting email
    '''
    matching = get_articles(keywords, feeds)
    plain, rich = compose_email_body(matching, keywords, feeds)
    send_email(email_config, plain, rich)
if __name__ == '__main__':
    # Command line: a mandatory config file and an optional '-o' file
    parser = argparse.ArgumentParser()
    parser.add_argument(type=argparse.FileType('r'), dest='config',
                        help='config file')
    # NOTE: '-o' is parsed for command line compatibility, but
    # args.output is not used anywhere below
    parser.add_argument('-o', dest='output', type=argparse.FileType('w'),
                        help='output file',
                        default=sys.stdout)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config)

    # Both settings are comma-separated lists. Strip the blanks around
    # each item, so 'a, b' searches for 'b' and not ' b', and drop empty
    # items: an empty keyword is a substring of every text and would make
    # every article match
    keywords = [keyword.strip()
                for keyword in config['SEARCH']['keywords'].split(',')
                if keyword.strip()]
    feeds = [feed.strip()
             for feed in config['SEARCH']['feeds'].split(',')
             if feed.strip()]

    email_user = config['EMAIL']['user']
    email_password = config['EMAIL']['password']
    email_from = config['EMAIL']['from']
    email_to = config['EMAIL']['to']
    email_config = EmailConfig(email_user, email_password, email_from,
                               email_to)

    main(keywords, feeds, email_config)