Mini-Python-Projects/languageModel.py at master · nidhi2509/Mini-Python-Projects · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from collections import Counter

import nltk
class LanguageModel:
    """Unigram/bigram statistics collected from a list of text files.

    The data-collection step (``build_ngram_data``) is implemented; the
    scoring methods (``smoothed_count``, ``get_logProb``,
    ``get_ngram_logProb``) had no body in the original file and are kept as
    explicit ``NotImplementedError`` stubs so the module is importable.

    Attributes:
        bigramList: every adjacent ``(word, next_word)`` pair seen, in order.
        vocabCount: ``Counter`` mapping each word to its number of occurrences.
        Vocab: set of distinct words (the keys of ``vocabCount``).
        vocabSize: ``len(Vocab)``.
        N_c: empty dict placeholder; never filled by this class.
    """

    # Tokens treated as punctuation and dropped before counting. These are
    # the characters listed in the original pattern, plus the two-character
    # quote tokens "``" and "''".
    PUNCTUATION = frozenset(
        {"'", ".", ",", "?", "!", "`", "``", "''", "(", ")", '"', ";", ":", "/", "|"}
    )

    def __init__(self, textFileList):
        """Create an empty model.

        Args:
            textFileList: iterable of text-file paths. It is accepted for
                interface compatibility but not read here; pass it to
                ``build_ngram_data`` to load the corpus.
        """
        self.bigramList = []
        # NOTE(review): presumably the count-of-counts table used by the
        # smoothing step (number of bigrams seen exactly c times) -- confirm
        # before implementing smoothed_count.
        self.N_c = {}
        self.vocabSize = 0
        self.Vocab = set()
        self.vocabCount = Counter()

    def smoothed_count(self, bigram):
        """Return the smoothed count for ``bigram`` (not implemented).

        The original file specified only "input = bigram, return smoothed
        count"; the smoothing scheme itself was never written.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("smoothed_count is not implemented yet")

    def build_ngram_data(self, textFileList):
        """Rebuild the unigram and bigram statistics from ``textFileList``.

        Each file is read in full, tokenized with ``nltk.word_tokenize``,
        stripped of punctuation tokens, and folded into ``vocabCount``,
        ``Vocab``, ``vocabSize`` and ``bigramList``. Any statistics from a
        previous call are discarded first.

        Args:
            textFileList: iterable of paths to the model text files.

        Raises:
            OSError: if a file cannot be opened or read.
        """
        # Start from a clean slate so calling this twice does not double count.
        self.bigramList = []
        self.vocabCount = Counter()
        self.Vocab = set()
        self.vocabSize = 0

        for textFile in textFileList:
            # Tokenize the file's contents, not its path, and close the handle.
            with open(textFile, "r") as handle:
                text = handle.read()
            self._add_tokens(nltk.word_tokenize(text))

    def _add_tokens(self, tokens):
        """Fold one file's tokens into the running unigram/bigram statistics."""
        # Build a filtered copy instead of removing items while iterating.
        words = [tok for tok in tokens if tok not in self.PUNCTUATION]
        self.vocabCount.update(words)
        # Bigrams are formed after punctuation removal, within a single file.
        self.bigramList.extend(zip(words, words[1:]))
        self.Vocab = set(self.vocabCount)
        self.vocabSize = len(self.Vocab)

    def get_logProb(self, tokenList):
        """Score ``tokenList`` under the model (not implemented).

        The original file contained only this signature.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("get_logProb is not implemented yet")

    def get_ngram_logProb(self, ngram):
        """Score a single ``ngram`` under the model (not implemented).

        The original file contained only this signature.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("get_ngram_logProb is not implemented yet")