-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
78 lines (54 loc) · 2.24 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import requests
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
import openpyxl
from exclude import exclude
# Characters stripped from every token before counting. Built once so each
# word is cleaned in a single str.translate pass instead of ten .replace calls.
_PUNCTUATION_TABLE = str.maketrans("", "", '.,?!:"()[]')


def get_keywords(url, output_path="output.xlsx", timeout=30):
    """Fetch a web page and report how often each keyword occurs, in percent.

    The text of all ``<p>`` and ``<h1>``-``<h6>`` elements is split on
    whitespace, lowercased and stripped of the characters ``. , ? ! : " ( ) [ ]``.
    Tokens found in the imported ``exclude`` collection are dropped, as are
    tokens that become empty after stripping.

    Args:
        url: Address of the page to analyse.
        output_path: Where the Excel table is written. Defaults to
            ``"output.xlsx"`` in the current working directory.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Keyword`` and ``Percentage``,
        ordered from most to least frequent. ``Percentage`` holds strings such
        as ``"1.23%"``, rounded to two decimals. The frame is empty when no
        keyword survives filtering.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems and timeouts.
    """
    # A timeout keeps an unresponsive server from hanging the call forever.
    page = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently analysing the text of an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    text = ' '.join(
        tag.text
        for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )

    words = []
    for raw_word in text.split():
        word = raw_word.lower().translate(_PUNCTUATION_TABLE)
        # Tokens made only of punctuation collapse to "" and must not be
        # counted as a keyword.
        # NOTE(review): `exclude` is presumably a list/set of lowercase
        # words; confirm against exclude.py.
        if word and word not in exclude:
            words.append(word)

    word_count = Counter(words)
    total_words = len(words)

    # most_common() orders by the numeric count. Sorting the formatted
    # strings instead would be lexicographic ("9.5%" would rank above "10.2%").
    rows = [
        (word, f"{round(count / total_words * 100, 2)}%")
        for word, count in word_count.most_common()
    ]
    df = pd.DataFrame(rows, columns=['Keyword', 'Percentage'])

    # Writing .xlsx relies on the openpyxl engine imported at the top of the file.
    df.to_excel(output_path, index=False)
    return df
# Example usage: runs only when the file is executed as a script, so importing
# this module does not trigger a network request or write output.xlsx.
if __name__ == "__main__":
    url = 'https://en.wikipedia.org/wiki/ELISA'
    keywords_df = get_keywords(url)