-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
95 lines (77 loc) · 2.95 KB
/
main.py
File metadata and controls
95 lines (77 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import hunspell
from bs4 import BeautifulSoup
import requests
import re
from flask import Flask, render_template, request
import csv, nltk
# Flask application object; the route decorators below register against it.
app = Flask(__name__)

# Locations of the Marathi Hunspell dictionary and its two affix files.
_DIC_PATH = "./marathi_words_updates.oxt_FILES/dicts/mr_IN.dic"
_AFF_PATH = "./marathi_words_updates.oxt_FILES/dicts/mr_IN.aff"
_SPLIT_AFF_PATH = "./marathi_words_updates.oxt_FILES/dicts/mr_IN_split.aff"

# Checker built from the dictionary plus the regular affix file.
spellchecker = hunspell.HunSpell(_DIC_PATH, _AFF_PATH)

# Checker built from the same dictionary plus the "split" affix file;
# mycheck() uses it for words longer than 12 characters.
spellchecker_split = hunspell.HunSpell(_DIC_PATH, _SPLIT_AFF_PATH)

# Module-level list of corrections: mycheck() appends to it and process()
# clears it at the start of each submitted URL.
words = []
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template("index.html")
    return page
from collections import defaultdict

# Lookup table built from "marathi_bigram_count.txt": each row's first
# space-separated field maps to the list of second fields seen with it.
# mycheck() uses it to prefer suggestions recorded after the previous word.
mydict = defaultdict(list)

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere and would garble or reject
# Devanagari text.  NOTE(review): assumes the file is UTF-8 -- confirm.
with open("marathi_bigram_count.txt", newline='', encoding="utf-8") as f:
    for row in csv.reader(f, delimiter=' '):
        # Blank or truncated lines yield fewer than two fields; skip them
        # rather than aborting application start-up with an IndexError.
        if len(row) < 2:
            continue
        mydict[row[0].strip()].append(row[1].strip())
def mycheck(myword):
    """Spell-check the second word of a pair and record one correction.

    Args:
        myword: A two-item sequence ``(previous_word, word)``. Only ``word``
            is spell-checked; ``previous_word`` is the key used to look up
            context in the module-level ``mydict`` table.

    Returns:
        None. When ``word`` is judged misspelled and the checker offers at
        least one suggestion, a dict with the keys ``'original_word'`` and
        ``'corrected_word'`` is appended to the module-level ``words`` list.
        The chosen correction is the first suggestion that appears in
        ``mydict[previous_word]``, or the first suggestion overall when none
        of them does.
    """
    previous, word = myword[0], myword[1]

    # Ignore words of one or two characters and anything containing a
    # Devanagari or ASCII digit.  These cheap tests run before the
    # dictionary lookup so such tokens never reach Hunspell.
    if len(word) <= 2 or re.search('[१२३४५६७८९०1234567890]', word):
        return
    if spellchecker.spell(word) is not False:
        return

    # Words longer than 12 characters go to the checker loaded with the
    # "split" affix file (presumably so run-together words can be broken
    # up -- confirm against the affix file).
    checker = spellchecker_split if len(word) > 12 else spellchecker
    try:
        suggestions = checker.suggest(word)
    except Exception:
        # Best effort: one word Hunspell cannot process must not abort the
        # whole page, but the failure is logged instead of vanishing.
        import logging
        logging.getLogger(__name__).warning(
            "hunspell suggest() failed for %r", word, exc_info=True
        )
        return
    if not suggestions:
        return

    # .get() avoids the defaultdict side effect of inserting an empty entry
    # for every previously unseen word, which would grow the shared table
    # on each request.
    followers = mydict.get(previous, ())
    best = next((s for s in suggestions if s in followers), suggestions[0])
    words.append({'original_word': word, 'corrected_word': best})
@app.route('/process', methods=['POST', 'GET'])
def process():
    """Fetch the submitted URL, spell-check its Devanagari text, show results.

    On POST, reads the ``url`` form field, downloads the page, strips
    ``<script>`` and ``<style>`` elements, and runs ``mycheck`` over every
    line: once for the first word (paired with the placeholder ``'NULL'``)
    and once for each adjacent word pair.  The module-level ``words`` list is
    cleared first and then rendered through ``success.html``.

    On any other method, the input form (``index.html``) is rendered, so a
    plain GET never returns an empty response or a previous request's results.

    Returns:
        The rendered template.

    Raises:
        requests.exceptions.RequestException: if the URL is invalid, the
            fetch fails, or it exceeds the timeout.
    """
    if request.method != 'POST':
        return render_template("index.html")

    url = request.form['url']
    print(url)

    # NOTE(security): the server fetches an arbitrary user-supplied URL,
    # which is a server-side request forgery (SSRF) risk if this app can
    # reach internal hosts.  Restrict allowed hosts before exposing it.
    headers = requests.utils.default_headers()
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is ``params``, which would put the headers in the query
    # string.  The timeout stops a stalled site from hanging the worker.
    req = requests.get(url, headers=headers, timeout=10)

    soup = BeautifulSoup(req.content, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text()

    words.clear()
    # Replace everything outside the Devanagari block (U+0900-U+097F) with
    # spaces so that only Devanagari tokens remain.
    non_devanagari = re.compile(r"[^\u0900-\u097F\n]")
    for line in text.splitlines():
        tokens = non_devanagari.sub(" ", line).split()
        if not tokens:
            continue
        # The first word has no predecessor, so pair it with a placeholder.
        mycheck(('NULL', tokens[0]))
        for pair in nltk.bigrams(tokens):
            mycheck(pair)
    return render_template("success.html", words=words)
if __name__ == '__main__':
    import os

    # host='0.0.0.0' listens on every network interface, and Flask's debug
    # mode enables the Werkzeug interactive debugger, which lets anyone who
    # can reach the server execute arbitrary Python.  Debug mode is
    # therefore opt-in: set FLASK_DEBUG=1 (or "true") for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(debug=debug_enabled, host='0.0.0.0')