Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 37 additions & 37 deletions consopt.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,21 @@ import (
// ConsOpt is a construction option for manual creation of a Corpus
//
// An option is a function that receives the *Corpus to populate and returns a
// non-nil error when it cannot be applied.
type ConsOpt func(c *Corpus) error

// WithWords creates a corpus from a word list. It may have repeated words
// WithWords creates a corpus from a word list. It may have repeated Words
//
// The returned option overwrites the corpus' word list, frequency table and
// word-to-ID map, and always returns nil.
//
// NOTE(review): part of this function is hidden behind a collapsed diff hunk,
// so the comments below describe only the visible statements.
func WithWords(a []string) ConsOpt {
f := func(c *Corpus) error {
// presumably set.Strings returns the distinct words of a — verify against the set package
s := set.Strings(a)
c.words = s
c.frequencies = make([]int, len(s))
c.Words = s
c.Frequencies = make([]int, len(s))

ids := make(map[string]int)
maxID := len(s)

var totalFreq, maxWL int
// NOTE: here we're iterating over the set of words
// NOTE: here we're iterating over the set of Words
for i, w := range s {
runeCount := utf8.RuneCountInString(w)
// NOTE(review): this compares against the corpus field, which is not updated
// inside the loop, rather than against the running maxWL. maxWL therefore ends
// up as the length of the last word that exceeds the field, not of the longest
// word, and stays 0 when no word exceeds it — confirm whether this is intended.
if runeCount > c.maxWordLength {
if runeCount > c.MaxWordLength_ {
maxWL = runeCount
}

Expand All @@ -35,14 +35,14 @@ func WithWords(a []string) ConsOpt {

// NOTE: here we're iterating over the original word list.
for _, w := range a {
// Every occurrence in the original list counts, so repeated words
// accumulate a frequency greater than one.
c.frequencies[ids[w]]++
c.Frequencies[ids[w]]++
totalFreq++
}

c.ids = ids
// The ID counter is incremented by the number of distinct words, not set to it.
atomic.AddInt64(&c.maxid, int64(maxID))
c.totalFreq = totalFreq
c.maxWordLength = maxWL
c.Ids = ids
atomic.AddInt64(&c.MaxID, int64(maxID))
c.TotalWordFreq = totalFreq
c.MaxWordLength_ = maxWL
return nil
}
return f
Expand All @@ -52,10 +52,10 @@ func WithWords(a []string) ConsOpt {
// WithOrderedWords returns an option that uses a directly as the corpus' word
// list, so each word's ID is its index in a, and gives every entry a frequency
// of 1. The option always returns nil.
//
// NOTE(review): part of this function is hidden behind a collapsed diff hunk;
// maxID and totalFreq are declared in lines that are not visible here.
func WithOrderedWords(a []string) ConsOpt {
f := func(c *Corpus) error {
// The slice is stored as-is (no copy), so the corpus shares a's backing array.
s := a
c.words = s
c.frequencies = make([]int, len(s))
for i := range c.frequencies {
c.frequencies[i] = 1
c.Words = s
c.Frequencies = make([]int, len(s))
for i := range c.Frequencies {
c.Frequencies[i] = 1
}

ids := make(map[string]int)
Expand All @@ -64,16 +64,16 @@ func WithOrderedWords(a []string) ConsOpt {
var maxWL int
for i, w := range a {
runeCount := utf8.RuneCountInString(w)
// NOTE(review): the comparison uses the corpus field, which does not change
// during the loop, instead of the running maxWL; the result is the length of
// the last word exceeding the field rather than the longest word — confirm.
if runeCount > c.maxWordLength {
if runeCount > c.MaxWordLength_ {
maxWL = runeCount
}
// A word that occurs more than once in a keeps the index of its last occurrence.
ids[w] = i
}

c.ids = ids
atomic.AddInt64(&c.maxid, int64(maxID))
c.totalFreq = totalFreq
c.maxWordLength = maxWL
c.Ids = ids
atomic.AddInt64(&c.MaxID, int64(maxID))
c.TotalWordFreq = totalFreq
c.MaxWordLength_ = maxWL
return nil
}
return f
Expand All @@ -82,8 +82,8 @@ func WithOrderedWords(a []string) ConsOpt {
// WithSize preallocates all the things in Corpus
//
// The returned option replaces the word and frequency slices with empty
// slices of capacity size, discarding whatever they held before. The
// word-to-ID map is not touched. The option always returns nil.
func WithSize(size int) ConsOpt {
return func(c *Corpus) error {
c.words = make([]string, 0, size)
c.frequencies = make([]int, 0, size)
c.Words = make([]string, 0, size)
c.Frequencies = make([]int, 0, size)
return nil
}
}
Expand All @@ -98,22 +98,22 @@ func FromDict(d map[string]int) ConsOpt {
a.ids = append(a.ids, v)
}
sort.Sort(&a)
c.ids = make(map[string]int)
c.Ids = make(map[string]int)
for i, w := range a.words {
if i != a.ids[i] {
return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i])
}
c.words = append(c.words, w)
c.frequencies = append(c.frequencies, 1)
c.ids[w] = i
c.Words = append(c.Words, w)
c.Frequencies = append(c.Frequencies, 1)
c.Ids[w] = i

c.totalFreq++
c.TotalWordFreq++
runeCount := utf8.RuneCountInString(w)
if runeCount > c.maxWordLength {
c.maxWordLength = runeCount
if runeCount > c.MaxWordLength_ {
c.MaxWordLength_ = runeCount
}
}
c.maxid = int64(len(a.words))
c.MaxID = int64(len(a.words))
return nil
}

Expand All @@ -129,22 +129,22 @@ func FromDictWithFreq(d map[string]struct{ ID, Freq int }) ConsOpt {
a.freqs = append(a.freqs, v.Freq)
}
sort.Sort(&a)
c.ids = make(map[string]int)
c.Ids = make(map[string]int)
for i, w := range a.words {
if i != a.ids[i] {
return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i])
}
c.words = append(c.words, w)
c.frequencies = append(c.frequencies, a.freqs[i])
c.ids[w] = i
c.Words = append(c.Words, w)
c.Frequencies = append(c.Frequencies, a.freqs[i])
c.Ids[w] = i

c.totalFreq += a.freqs[i]
c.TotalWordFreq += a.freqs[i]
runeCount := utf8.RuneCountInString(w)
if runeCount > c.maxWordLength {
c.maxWordLength = runeCount
if runeCount > c.MaxWordLength_ {
c.MaxWordLength_ = runeCount
}
}
c.maxid = int64(len(a.words))
c.MaxID = int64(len(a.words))
return nil
}
}
114 changes: 57 additions & 57 deletions corpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,30 @@ import (
// Corpus is a data structure holding the relevant metadata and information for a corpus of text.
// It serves as vocabulary with ID for lookup. This is very useful as neural networks rely on the IDs rather than the text themselves
//
// NOTE(review): in this listing each field appears twice — first under its
// earlier unexported name, then under the exported, JSON-tagged name that
// replaces it.
type Corpus struct {
// Word list indexed by ID, and the per-ID occurrence counts kept parallel to it.
words []string
frequencies []int
Words []string `json:"words"`
Frequencies []int `json:"frequencies"`

// Reverse lookup from a word to its ID (its index in the word list).
ids map[string]int
Ids map[string]int `json:"ids"`

// atomic read and write plz
// The ID counter: Add increments it for every new word, and Size reports it.
maxid int64
// Sum of all frequency counts, and the rune length of the longest word.
totalFreq int
maxWordLength int
MaxID int64 `json:"max_id"`
TotalWordFreq int `json:"total_word_freq"`
MaxWordLength_ int `json:"max_word_length"`
}

// New creates a new *Corpus
//
// The corpus starts with empty (non-nil) slices and map and is seeded with
// the special entries "", "-UNKNOWN-" and "-ROOT-", added in that order via Add.
func New() *Corpus {
c := &Corpus{
words: make([]string, 0),
frequencies: make([]int, 0),
ids: make(map[string]int),
Words: make([]string, 0),
Frequencies: make([]int, 0),
Ids: make(map[string]int),
}

// add some default words
c.Add("") // aka NULL - when there are no words
// add some default Words
c.Add("") // aka NULL - when there are no Words
c.Add("-UNKNOWN-")
c.Add("-ROOT-")
// Add updated the longest-word length while inserting the specials; undo that.
c.maxWordLength = 0 // specials don't have lengths
c.MaxWordLength_ = 0 // specials don't have lengths

return c
}
Expand All @@ -43,14 +43,14 @@ func Construct(opts ...ConsOpt) (*Corpus, error) {
c := new(Corpus)

// checks
if c.words == nil {
c.words = make([]string, 0)
if c.Words == nil {
c.Words = make([]string, 0)
}
if c.frequencies == nil {
c.frequencies = make([]int, 0)
if c.Frequencies == nil {
c.Frequencies = make([]int, 0)
}
if c.ids == nil {
c.ids = make(map[string]int)
if c.Ids == nil {
c.Ids = make(map[string]int)
}

for _, opt := range opts {
Expand All @@ -62,80 +62,80 @@ func Construct(opts ...ConsOpt) (*Corpus, error) {
return c, nil
}

// ID returns the ID of a word and whether or not it was found in the corpus
// Id returns the ID of a word and whether or not it was found in the corpus
//
// When the word is absent the returned ID is 0, the map's zero value. Callers
// must therefore rely on the boolean rather than on the ID to detect a miss.
func (c *Corpus) Id(word string) (int, bool) {
id, ok := c.ids[word]
id, ok := c.Ids[word]
return id, ok
}

// Word returns the word given the ID, and whether or not it was found in the corpus
//
// An ID greater than or equal to the atomically loaded ID counter yields
// ("", false).
//
// NOTE(review): there is no lower-bound check, so a negative id reaches the
// slice index below and would panic — confirm callers never pass one.
func (c *Corpus) Word(id int) (string, bool) {
// Snapshot the ID counter atomically; it is the exclusive upper bound for valid IDs.
size := atomic.LoadInt64(&c.maxid)
size := atomic.LoadInt64(&c.MaxID)
maxid := int(size)

if id >= maxid {
return "", false
}
return c.words[id], true
return c.Words[id], true
}

// Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID
//
// A new word receives the value the ID counter held before this call, is
// appended to the word list with a frequency of 1, and may raise the recorded
// maximum word length (measured in runes).
//
// NOTE(review): only the ID counter is updated atomically; the map and slice
// updates are plain writes, so concurrent use presumably needs external
// synchronization — confirm.
func (c *Corpus) Add(word string) int {
// Known word: bump its count and the corpus-wide total, keep its existing ID.
if id, ok := c.ids[word]; ok {
c.frequencies[id]++
c.totalFreq++
if id, ok := c.Ids[word]; ok {
c.Frequencies[id]++
c.TotalWordFreq++
return id
}

// AddInt64 returns the incremented counter, hence the "- 1" to get this word's ID.
id := atomic.AddInt64(&c.maxid, 1)
c.ids[word] = int(id - 1)
c.words = append(c.words, word)
c.frequencies = append(c.frequencies, 1)
c.totalFreq++
id := atomic.AddInt64(&c.MaxID, 1)
c.Ids[word] = int(id - 1)
c.Words = append(c.Words, word)
c.Frequencies = append(c.Frequencies, 1)
c.TotalWordFreq++

// Length is counted in runes, not bytes.
runeCount := utf8.RuneCountInString(word)
if runeCount > c.maxWordLength {
c.maxWordLength = runeCount
if runeCount > c.MaxWordLength_ {
c.MaxWordLength_ = runeCount
}

return int(id - 1)
}

// Size returns the size of the corpus.
//
// The value is the atomically loaded ID counter converted to int; it is not
// derived from the length of the word list.
func (c *Corpus) Size() int {
size := atomic.LoadInt64(&c.maxid)
size := atomic.LoadInt64(&c.MaxID)
return int(size)
}

// WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0.
//
// The word is first resolved to its ID through the word-to-ID map; the ID
// then indexes the frequency table.
func (c *Corpus) WordFreq(word string) int {
id, ok := c.ids[word]
id, ok := c.Ids[word]
if !ok {
return 0
}

return c.frequencies[id]
return c.Frequencies[id]
}

// IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0.
//
// "Not in the corpus" here means the ID is greater than or equal to the
// atomically loaded ID counter.
//
// NOTE(review): there is no lower-bound check, so a negative id reaches the
// slice index below and would panic — confirm callers never pass one.
func (c *Corpus) IDFreq(id int) int {
size := atomic.LoadInt64(&c.maxid)
size := atomic.LoadInt64(&c.MaxID)
maxid := int(size)

if id >= maxid {
return 0
}
return c.frequencies[id]
return c.Frequencies[id]
}

// TotalFreq returns the total number of words ever seen by the corpus. This number includes the count of repeat words.
// TotalFreq returns the total number of Words ever seen by the corpus. This number includes the count of repeat Words.
//
// It is a plain read of the running total maintained by Add and Merge.
func (c *Corpus) TotalFreq() int {
return c.totalFreq
return c.TotalWordFreq
}

// MaxWordLength returns the length of the longest known word in the corpus.
//
// The length is counted in runes (Add measures words with
// utf8.RuneCountInString). New resets the value to 0 after inserting its
// special entries, so those do not contribute.
func (c *Corpus) MaxWordLength() int {
return c.maxWordLength
return c.MaxWordLength_
}

// WordProb returns the probability of a word appearing in the corpus.
Expand All @@ -145,22 +145,22 @@ func (c *Corpus) WordProb(word string) (float64, bool) {
return 0, false
}

count := c.frequencies[id]
return float64(count) / float64(c.totalFreq), true
count := c.Frequencies[id]
return float64(count) / float64(c.TotalWordFreq), true

}

// Merge combines two corpuses. The receiver is the one that is mutated.
//
// Every word of other is folded into c together with its frequency: words c
// already knows keep their ID in c, unknown words are appended via Add and
// receive fresh IDs. other itself is only read.
func (c *Corpus) Merge(other *Corpus) {
for i, word := range other.words {
freq := other.frequencies[i]
if id, ok := c.ids[word]; ok {
c.frequencies[id] += freq
c.totalFreq += freq
for i, word := range other.Words {
freq := other.Frequencies[i]
if id, ok := c.Ids[word]; ok {
c.Frequencies[id] += freq
c.TotalWordFreq += freq
} else {
// Add already counts the new word once (frequency 1, total +1), so only the
// remaining freq-1 occurrences are added on top.
id := c.Add(word)
c.frequencies[id] += freq - 1
c.totalFreq += freq - 1
c.Frequencies[id] += freq - 1
c.TotalWordFreq += freq - 1
}
}
}
Expand All @@ -170,28 +170,28 @@ func (c *Corpus) Merge(other *Corpus) {
// e.g: c.Replace("foo", "bar")
// c.Id("foo") will still return a ID. The ID will be the same as c.Id("bar")
func (c *Corpus) Replace(a, with string) error {
// The word being replaced must exist; its ID is reused for the replacement.
old, ok := c.ids[a]
old, ok := c.Ids[a]
if !ok {
return errors.Errorf("Cannot replace %q with %q. %q is not found", a, with, a)
}
// Refuse to overwrite a word that already exists in the corpus.
if _, ok := c.ids[with]; ok {
if _, ok := c.Ids[with]; ok {
return errors.Errorf("Cannot replace %q with %q. %q exists in the corpus", a, with, with)
}
// Swap the text stored at the ID and map the new word to it. The entry for a
// is not deleted, so both words resolve to the same ID afterwards; the
// frequency table is left untouched.
c.words[old] = with
c.ids[with] = old
c.Words[old] = with
c.Ids[with] = old
return nil

}

// ReplaceWord replaces the word associated with the given ID. The old reference remains.
//
// It returns an error when id is at or beyond the length of the word list, or
// when with is already present in the corpus. On success the word list entry
// is overwritten and with is mapped to id; the previous word's map entry is
// not removed and the frequency table is not changed.
//
// NOTE(review): the bound used here is the length of the word list, whereas
// Word and IDFreq compare against the ID counter. There is also no check for
// a negative id, which would panic at the slice index below — confirm.
func (c *Corpus) ReplaceWord(id int, with string) error {
if id >= len(c.words) {
if id >= len(c.Words) {
return errors.Errorf("Cannot replace word with ID %d. Out of bounds.", id)
}
if _, ok := c.ids[with]; ok {
if _, ok := c.Ids[with]; ok {
return errors.Errorf("Cannot replace word with ID %d with %q. %q exists in the corpus", id, with, with)
}
c.words[id] = with
c.ids[with] = id
c.Words[id] = with
c.Ids[with] = id
return nil
}
Loading