-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_analyze.py
More file actions
30 lines (27 loc) · 1.16 KB
/
text_analyze.py
File metadata and controls
30 lines (27 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from explict_check import predict_prob
from db_helper import upsertInSentenceScores
def processTextAndSaveDB(pageText, crawledPageId):
    """Extract visible text from a crawled HTML page, score its sentences,
    and upsert the per-sentence scores into the database.

    Args:
        pageText: Raw HTML of the crawled page.
        crawledPageId: Identifier of the crawled page these scores belong to.

    Side effects:
        Calls upsertInSentenceScores() once with all score records
        (skipped entirely when the page yields no sentences).
    """
    soup = BeautifulSoup(pageText, 'html.parser')
    # Common content-bearing tags; text from all of them is pooled together.
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'div']
    paragraph = ' '.join(tag.text for tag in soup.find_all(TAGS))
    # Re-join word tokens so sentence tokenization sees normalized spacing.
    tokenizedParagraph = ' '.join(word_tokenize(paragraph))
    sentences = sent_tokenize(tokenizedParagraph)

    pageSentenceScores = []
    # Score in fixed-size batches to bound the input handed to the model.
    windowSize = 500
    for start in range(0, len(sentences), windowSize):
        batch = sentences[start:start + windowSize]
        scores = predict_prob(batch)
        for sentence, score in zip(batch, scores):
            pageSentenceScores.append({
                'crawledPageId': crawledPageId,
                'text': sentence,
                'score': score,
                # NOTE(review): naive local timestamp, matching original
                # behavior — confirm whether the DB expects UTC.
                'updatedAt': datetime.now(),
            })

    if pageSentenceScores:  # avoid a DB round-trip for empty pages
        upsertInSentenceScores(pageSentenceScores)