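"""Compute sentiment and readability metrics for scraped articles.

Reads URLs from Input.xlsx, scrapes each article with scrape_data, and writes
the computed metrics into 'Output Data Structure.xlsx'.
"""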
import os

import nltk
import pandas as pd
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize
from textstat import sentence_count

from data_scraping import scrape_data

nltk.download('punkt')
nltk.download('cmudict')
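# Note: newer NLTK releases replace the 'punkt' models with 'punkt_tab'; if
# word_tokenize raises a LookupError, nltk.download('punkt_tab') may be needed.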
# Load the lists of positive and negative words from the MasterDictionary
with open('./MasterDictionary/positive-words.txt', 'r') as f:
    positive_words = f.read().splitlines()
with open('./MasterDictionary/negative-words.txt', 'r') as f:
    negative_words = f.read().splitlines()
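# Note: the sentiment matching below is case-sensitive, so the word lists are
# assumed to be lowercase and tokens are compared as-is.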
# Initialize an empty set to store the stop words
stop_words = set()
# Directory containing the stopword files
directory = 'StopWords/'
# Loop over all files in the directory
for filename in os.listdir(directory):
    # Only consider .txt files
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r') as f:
            # Add the stop words from this file to the set
            stop_words.update(f.read().splitlines())
# Optional debug output: uncomment to inspect the loaded word lists
# print(f"Number of positive words: {len(positive_words)}")
# print(f"First 10 positive words: {positive_words[:10]}")
# print(f"Number of negative words: {len(negative_words)}")
# print(f"First 10 negative words: {negative_words[:10]}")
# print(f"Number of stop words: {len(stop_words)}")
# print(f"First 10 stop words: {list(stop_words)[:10]}")
input_data = pd.read_excel("Input.xlsx")

# CMU Pronouncing Dictionary, used to count syllables per word
d = cmudict.dict()


def nsyl(word):
    """Return the syllable count for each cmudict pronunciation of `word`."""
    try:
        return [len([y for y in x if y[-1].isdigit()]) for x in d[word.lower()]]
    except KeyError:
        # Word not found in cmudict
        return [0]
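# Quick sanity check (illustrative): nsyl('python') should return [2], while an
# out-of-vocabulary token falls back to [0].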
def calculate_metrics(text):
    """Compute the sentiment and readability metrics for one article.

    Assumes `text` is non-empty: several metrics divide by the word and
    sentence counts.
    """
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Calculate word count
    word_count = len(tokens)
    # Calculate positive and negative scores
    positive_score = sum(word in positive_words for word in tokens)
    negative_score = sum(word in negative_words for word in tokens)
    # Calculate polarity score
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    # Calculate subjectivity score
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)
    # Calculate average sentence length
    avg_sentence_length = word_count / sentence_count(text)
    # Calculate percentage of complex words (more than two syllables)
    complex_words = [word for word in tokens if word.isalpha() and nsyl(word)[0] > 2]
    percentage_complex_words = len(complex_words) / word_count
    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Calculate average number of words per sentence
    avg_words_per_sentence = word_count / len(sent_tokenize(text))
    # Calculate complex word count
    complex_word_count = len(complex_words)
    # Calculate syllables per word
    syllable_per_word = sum(nsyl(word)[0] for word in tokens) / word_count
    # Count personal pronouns
    personal_pronouns = sum(1 for word in tokens if word.lower() in ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'])
    # Calculate average word length
    avg_word_length = sum(len(word) for word in tokens) / word_count
    return (polarity_score, subjectivity_score, avg_sentence_length,
            percentage_complex_words, fog_index, avg_words_per_sentence,
            complex_word_count, word_count, syllable_per_word,
            personal_pronouns, avg_word_length)
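# The metrics tuple is indexed positionally below, so its order must stay in
# sync with the column assignments in the output loop.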
# Read the existing data
output_data = pd.read_excel('Output Data Structure.xlsx')

# Loop through the URLs and scrape data
for index, row in input_data.iterrows():
    url = row['URL']
    title, article_text = scrape_data(url)
    # Calculate metrics
    metrics = calculate_metrics(article_text)
    # Print metrics
    print(f'URL: {url}\nTitle: {title}\nMetrics: {metrics}\n')
    # Add metrics to output data
    # output_data.loc[index, 'Title'] = title
    output_data.loc[index, 'Polarity Score'] = metrics[0]
    output_data.loc[index, 'Subjectivity Score'] = metrics[1]
    output_data.loc[index, 'Avg Sentence Length'] = metrics[2]
    output_data.loc[index, 'Percentage Complex Words'] = metrics[3]
    output_data.loc[index, 'Fog Index'] = metrics[4]
    output_data.loc[index, 'Avg Words per Sentence'] = metrics[5]
    output_data.loc[index, 'Complex Word Count'] = metrics[6]
    output_data.loc[index, 'Word Count'] = metrics[7]
    output_data.loc[index, 'Syllable per Word'] = metrics[8]
    output_data.loc[index, 'Personal Pronouns'] = metrics[9]
    output_data.loc[index, 'Avg Word Length'] = metrics[10]

# Write output data back to Excel file
output_data.to_excel('Output Data Structure.xlsx', index=False)