TextInsight-Web-Data-Wizard/utils.py at main · Shankjbs571/TextInsight-Web-Data-Wizard · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import re
from nltk import word_tokenize
from nltk.corpus import cmudict

#@title p_n_scores, polarity_score,subjectivity_score functions
def p_n_scores(filtered_content,positive_words,negative_words):
  """Count positive and negative words in whitespace-separated text.

  Each token is tested against ``positive_words`` first; only tokens that
  are not positive are tested against ``negative_words``, so a token that
  appears in both collections is counted as positive only.

  Returns a ``(positive_score, negative_score)`` tuple of ints.
  """
  tokens = filtered_content.split()
  # One membership test per token against the positive lexicon
  positive_mask = [token in positive_words for token in tokens]
  positive_score = sum(positive_mask)
  # Negative lookup happens only for tokens that were not positive
  negative_score = sum(
    1
    for token, is_positive in zip(tokens, positive_mask)
    if not is_positive and token in negative_words
  )
  return positive_score,negative_score

def polarity_score(p,n):
  """Return the polarity (p - n) / ((p + n) + 0.000001).

  ``p`` and ``n`` are the positive and negative scores. The small constant
  in the denominator avoids division by zero when both scores are 0.
  """
  return (p - n) / ((p + n) + 0.000001)

def subjectivity_score(p,n,total_words):
  """Return the subjectivity (p + n) / (total_words + 0.000001).

  ``p`` and ``n`` are the positive and negative scores. The small constant
  in the denominator avoids division by zero when ``total_words`` is 0.
  """
  scored_words = p + n
  denominator = total_words + 0.000001
  return scored_words / denominator


#@title Let's find how many complex words there are


def count_syllables(word, pronunciations):
    """Return the largest syllable count among a word's pronunciations.

    ``pronunciations`` maps a lowercase word to a list of pronunciations,
    each a sequence of phoneme strings. A phoneme whose last character is a
    digit is counted as one syllable. Words missing from the mapping (or
    with no pronunciations) yield 0.
    """
    candidates = pronunciations.get(word.lower(), [])
    # One syllable count per pronunciation; the default covers unknown words
    per_pronunciation = (
        sum(1 for phoneme in phonemes if phoneme[-1].isdigit())
        for phonemes in candidates
    )
    return max(per_pronunciation, default=0)


def is_complex(word, pronunciations):
    """Return True when ``word`` has more than two syllables.

    The syllable count comes from ``count_syllables`` using the supplied
    ``pronunciations`` mapping.
    """
    syllables = count_syllables(word, pronunciations)
    return syllables > 2

# Load the CMU Pronouncing Dictionary once, at import time, so the functions
# in this module can share it through their ``pronunciations`` argument.
# NOTE(review): presumably requires the NLTK "cmudict" corpus to be installed
# (e.g. via nltk.download('cmudict')) before this module is imported - confirm.
pronunciations = cmudict.dict()


def complex_words_percentage(filtered_content,pronunciations):
  """Count words and complex words in a text and compute their percentage.

  The text is tokenized with ``word_tokenize`` and only purely alphabetic
  (ASCII a-z / A-Z) tokens are treated as words. A word is complex when
  ``is_complex`` says so for the given ``pronunciations`` mapping.

  Returns a ``(total_words, complex_words, percentage_complex_words)``
  tuple. When the text contains no alphabetic words the percentage is
  ``0.0`` instead of raising ``ZeroDivisionError``.
  """
  # Drop punctuation, numbers and mixed tokens so they are not counted as words
  words = [
    word
    for word in word_tokenize(filtered_content)
    if re.match(r'^[a-zA-Z]+$', word)
  ]

  total_words = len(words)
  complex_words = sum(1 for word in words if is_complex(word, pronunciations))

  # No words means there is nothing to take a percentage of
  if total_words == 0:
    return total_words,complex_words,0.0

  percentage_complex_words = (complex_words / total_words)*100
  return total_words,complex_words,percentage_complex_words


#@title Analysis of Readability

from nltk import sent_tokenize


def fog_index(content,pronunciations):
  """Compute the fog index, average sentence length and sentence count.

  Fog Index = 0.4 * (average sentence length + percentage of complex words),
  where sentences come from ``sent_tokenize`` and the word statistics come
  from ``complex_words_percentage``.

  Returns a ``(fog_index, average_sentence_length, number_of_sentences)``
  tuple. When no sentences are found (for example, empty content) the
  result is ``(0.0, 0.0, 0)`` instead of raising ``ZeroDivisionError``.
  """
  sentences = sent_tokenize(content)
  num_sentences = len(sentences)

  # Without sentences there is nothing to average; skip the word analysis
  # entirely so this guard does not depend on how it handles empty text.
  if num_sentences == 0:
    return 0.0, 0.0, 0

  total_words,complex_words,percentage_complex_words = complex_words_percentage(content,pronunciations)
  avg_sent_len = total_words / num_sentences
  fog_i = 0.4 * (avg_sent_len + percentage_complex_words)
  return fog_i , avg_sent_len,num_sentences


#@title Syllable Count Per Word
# Tokenize the text into words

def syllable_count_per_word(article,pronunciations):
  """Return total syllables, total tokens and average syllables per token.

  The article is tokenized with ``word_tokenize`` and every token is
  counted, with its syllables looked up through ``count_syllables``. The
  average is 0 when the article yields no tokens.
  """
  tokens = word_tokenize(article)
  total_words = len(tokens)

  # Accumulate the syllable count token by token
  total_syllables = 0
  for token in tokens:
    total_syllables += count_syllables(token,pronunciations)

  syllables_per_word = total_syllables / total_words if total_words > 0 else 0
  return total_syllables,total_words,syllables_per_word


#@title Count Personal Pronouns

def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The
    all-uppercase token "US" is treated as the country abbreviation and is
    not counted; "us" and "Us" are counted as pronouns.

    Returns the number of pronoun occurrences as an int.
    """
    pattern = r"\b(?:I|we|my|ours|us)\b"
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # Only the exact uppercase form is the country name. Comparing the
    # lowercased match would also discard every genuine "us" pronoun.
    return sum(1 for match in matches if match != "US")


#@title Average Word Length

def average_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Returns 0 when ``text`` contains no words.
    """
    tokens = text.split()
    if not tokens:
        return 0

    # Total characters across all words divided by the number of words
    return sum(map(len, tokens)) / len(tokens)


#@title Get the stopwords removed

def remove_stop_words(article_content,stop_words):
  """Drop stop words from whitespace-separated text.

  A word is dropped when its lowercase form is in ``stop_words``; kept
  words retain their original casing and order.

  Returns a ``(filtered_words, filtered_content)`` tuple: the list of kept
  words and the same words joined with single spaces.
  """
  kept = []
  for token in article_content.split():
    if token.lower() in stop_words:
      continue
    kept.append(token)

  return kept,' '.join(kept)