From e281eae8b359980f915d9c45e999ca31bf3fd07e Mon Sep 17 00:00:00 2001 From: Florent Aide Date: Fri, 3 Nov 2023 14:26:57 +0100 Subject: [PATCH] add tags to allow easy csv export --- consopt.go | 74 +++++++++++++++--------------- corpus.go | 114 +++++++++++++++++++++++----------------------- corpus_test.go | 16 +++---- functions.go | 6 +-- functions_test.go | 4 +- io.go | 44 +++++++++--------- io_test.go | 18 ++++---- 7 files changed, 138 insertions(+), 138 deletions(-) diff --git a/consopt.go b/consopt.go index ab7b85d..e727724 100644 --- a/consopt.go +++ b/consopt.go @@ -12,21 +12,21 @@ import ( // ConsOpt is a construction option for manual creation of a Corpus type ConsOpt func(c *Corpus) error -// WithWords creates a corpus from a word list. It may have repeated words +// WithWords creates a corpus from a word list. It may have repeated Words func WithWords(a []string) ConsOpt { f := func(c *Corpus) error { s := set.Strings(a) - c.words = s - c.frequencies = make([]int, len(s)) + c.Words = s + c.Frequencies = make([]int, len(s)) ids := make(map[string]int) maxID := len(s) var totalFreq, maxWL int - // NOTE: here we're iterating over the set of words + // NOTE: here we're iterating over the set of Words for i, w := range s { runeCount := utf8.RuneCountInString(w) - if runeCount > c.maxWordLength { + if runeCount > c.MaxWordLength_ { maxWL = runeCount } @@ -35,14 +35,14 @@ func WithWords(a []string) ConsOpt { // NOTE: here we're iterating over the original word list. for _, w := range a { - c.frequencies[ids[w]]++ + c.Frequencies[ids[w]]++ totalFreq++ } - c.ids = ids - atomic.AddInt64(&c.maxid, int64(maxID)) - c.totalFreq = totalFreq - c.maxWordLength = maxWL + c.Ids = ids + atomic.AddInt64(&c.MaxID, int64(maxID)) + c.TotalWordFreq = totalFreq + c.MaxWordLength_ = maxWL return nil } return f @@ -52,10 +52,10 @@ func WithWords(a []string) ConsOpt { func WithOrderedWords(a []string) ConsOpt { f := func(c *Corpus) error { s := a - c.words = s - c.frequencies = make([]int, len(s)) - for i := range c.frequencies { - c.frequencies[i] = 1 + c.Words = s + c.Frequencies = make([]int, len(s)) + for i := range c.Frequencies { + c.Frequencies[i] = 1 } ids := make(map[string]int) @@ -64,16 +64,16 @@ func WithOrderedWords(a []string) ConsOpt { var maxWL int for i, w := range a { runeCount := utf8.RuneCountInString(w) - if runeCount > c.maxWordLength { + if runeCount > c.MaxWordLength_ { maxWL = runeCount } ids[w] = i } - c.ids = ids - atomic.AddInt64(&c.maxid, int64(maxID)) - c.totalFreq = totalFreq - c.maxWordLength = maxWL + c.Ids = ids + atomic.AddInt64(&c.MaxID, int64(maxID)) + c.TotalWordFreq = totalFreq + c.MaxWordLength_ = maxWL return nil } return f @@ -82,8 +82,8 @@ func WithOrderedWords(a []string) ConsOpt { // WithSize preallocates all the things in Corpus func WithSize(size int) ConsOpt { return func(c *Corpus) error { - c.words = make([]string, 0, size) - c.frequencies = make([]int, 0, size) + c.Words = make([]string, 0, size) + c.Frequencies = make([]int, 0, size) return nil } } @@ -98,22 +98,22 @@ func FromDict(d map[string]int) ConsOpt { a.ids = append(a.ids, v) } sort.Sort(&a) - c.ids = make(map[string]int) + c.Ids = make(map[string]int) for i, w := range a.words { if i != a.ids[i] { return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i]) } - c.words = append(c.words, w) - c.frequencies = append(c.frequencies, 1) - c.ids[w] = i + c.Words = append(c.Words, w) + c.Frequencies = append(c.Frequencies, 1) + c.Ids[w] = i - c.totalFreq++ + c.TotalWordFreq++ runeCount := utf8.RuneCountInString(w) - if runeCount > c.maxWordLength { - c.maxWordLength = runeCount + if runeCount > c.MaxWordLength_ { + c.MaxWordLength_ = runeCount } } - c.maxid = int64(len(a.words)) + c.MaxID = int64(len(a.words)) return nil } @@ -129,22 +129,22 @@ func FromDictWithFreq(d map[string]struct{ ID, Freq int }) ConsOpt { a.freqs = append(a.freqs, v.Freq) } sort.Sort(&a) - c.ids = make(map[string]int) + c.Ids = make(map[string]int) for i, w := range a.words { if i != a.ids[i] { return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i]) } - c.words = append(c.words, w) - c.frequencies = append(c.frequencies, a.freqs[i]) - c.ids[w] = i + c.Words = append(c.Words, w) + c.Frequencies = append(c.Frequencies, a.freqs[i]) + c.Ids[w] = i - c.totalFreq += a.freqs[i] + c.TotalWordFreq += a.freqs[i] runeCount := utf8.RuneCountInString(w) - if runeCount > c.maxWordLength { - c.maxWordLength = runeCount + if runeCount > c.MaxWordLength_ { + c.MaxWordLength_ = runeCount } } - c.maxid = int64(len(a.words)) + c.MaxID = int64(len(a.words)) return nil } } diff --git a/corpus.go b/corpus.go index 6ed1c41..35b5ec3 100644 --- a/corpus.go +++ b/corpus.go @@ -10,30 +10,30 @@ import ( // Corpus is a data structure holding the relevant metadata and information for a corpus of text. // It serves as vocabulary with ID for lookup. This is very useful as neural networks rely on the IDs rather than the text themselves type Corpus struct { - words []string - frequencies []int + Words []string `json:"words"` + Frequencies []int `json:"frequencies"` - ids map[string]int + Ids map[string]int `json:"ids"` // atomic read and write plz - maxid int64 - totalFreq int - maxWordLength int + MaxID int64 `json:"max_id"` + TotalWordFreq int `json:"total_word_freq"` + MaxWordLength_ int `json:"max_word_length"` } // New creates a new *Corpus func New() *Corpus { c := &Corpus{ - words: make([]string, 0), - frequencies: make([]int, 0), - ids: make(map[string]int), + Words: make([]string, 0), + Frequencies: make([]int, 0), + Ids: make(map[string]int), } - // add some default words - c.Add("") // aka NULL - when there are no words + // add some default Words + c.Add("") // aka NULL - when there are no Words c.Add("-UNKNOWN-") c.Add("-ROOT-") - c.maxWordLength = 0 // specials don't have lengths + c.MaxWordLength_ = 0 // specials don't have lengths return c } @@ -43,14 +43,14 @@ func Construct(opts ...ConsOpt) (*Corpus, error) { c := new(Corpus) // checks - if c.words == nil { - c.words = make([]string, 0) + if c.Words == nil { + c.Words = make([]string, 0) } - if c.frequencies == nil { - c.frequencies = make([]int, 0) + if c.Frequencies == nil { + c.Frequencies = make([]int, 0) } - if c.ids == nil { - c.ids = make(map[string]int) + if c.Ids == nil { + c.Ids = make(map[string]int) } for _, opt := range opts { @@ -62,40 +62,40 @@ func Construct(opts ...ConsOpt) (*Corpus, error) { return c, nil } -// ID returns the ID of a word and whether or not it was found in the corpus +// Id returns the ID of a word and whether or not it was found in the corpus func (c *Corpus) Id(word string) (int, bool) { - id, ok := c.ids[word] + id, ok := c.Ids[word] return id, ok } // Word returns the word given the ID, and whether or not it was found in the corpus func (c *Corpus) Word(id int) (string, bool) { - size := atomic.LoadInt64(&c.maxid) + size := atomic.LoadInt64(&c.MaxID) maxid := int(size) if id >= maxid { return "", false } - return c.words[id], true + return c.Words[id], true } // Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID func (c *Corpus) Add(word string) int { - if id, ok := c.ids[word]; ok { - c.frequencies[id]++ - c.totalFreq++ + if id, ok := c.Ids[word]; ok { + c.Frequencies[id]++ + c.TotalWordFreq++ return id } - id := atomic.AddInt64(&c.maxid, 1) - c.ids[word] = int(id - 1) - c.words = append(c.words, word) - c.frequencies = append(c.frequencies, 1) - c.totalFreq++ + id := atomic.AddInt64(&c.MaxID, 1) + c.Ids[word] = int(id - 1) + c.Words = append(c.Words, word) + c.Frequencies = append(c.Frequencies, 1) + c.TotalWordFreq++ runeCount := utf8.RuneCountInString(word) - if runeCount > c.maxWordLength { - c.maxWordLength = runeCount + if runeCount > c.MaxWordLength_ { + c.MaxWordLength_ = runeCount } return int(id - 1) @@ -103,39 +103,39 @@ func (c *Corpus) Add(word string) int { // Size returns the size of the corpus. func (c *Corpus) Size() int { - size := atomic.LoadInt64(&c.maxid) + size := atomic.LoadInt64(&c.MaxID) return int(size) } // WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0. func (c *Corpus) WordFreq(word string) int { - id, ok := c.ids[word] + id, ok := c.Ids[word] if !ok { return 0 } - return c.frequencies[id] + return c.Frequencies[id] } // IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0. func (c *Corpus) IDFreq(id int) int { - size := atomic.LoadInt64(&c.maxid) + size := atomic.LoadInt64(&c.MaxID) maxid := int(size) if id >= maxid { return 0 } - return c.frequencies[id] + return c.Frequencies[id] } -// TotalFreq returns the total number of words ever seen by the corpus. This number includes the count of repeat words. +// TotalFreq returns the total number of Words ever seen by the corpus. This number includes the count of repeat Words. func (c *Corpus) TotalFreq() int { - return c.totalFreq + return c.TotalWordFreq } // MaxWordLength returns the length of the longest known word in the corpus. func (c *Corpus) MaxWordLength() int { - return c.maxWordLength + return c.MaxWordLength_ } // WordProb returns the probability of a word appearing in the corpus. @@ -145,22 +145,22 @@ func (c *Corpus) WordProb(word string) (float64, bool) { return 0, false } - count := c.frequencies[id] - return float64(count) / float64(c.totalFreq), true + count := c.Frequencies[id] + return float64(count) / float64(c.TotalWordFreq), true } // Merge combines two corpuses. The receiver is the one that is mutated. func (c *Corpus) Merge(other *Corpus) { - for i, word := range other.words { - freq := other.frequencies[i] - if id, ok := c.ids[word]; ok { - c.frequencies[id] += freq - c.totalFreq += freq + for i, word := range other.Words { + freq := other.Frequencies[i] + if id, ok := c.Ids[word]; ok { + c.Frequencies[id] += freq + c.TotalWordFreq += freq } else { id := c.Add(word) - c.frequencies[id] += freq - 1 - c.totalFreq += freq - 1 + c.Frequencies[id] += freq - 1 + c.TotalWordFreq += freq - 1 } } } @@ -170,28 +170,28 @@ func (c *Corpus) Merge(other *Corpus) { // e.g: c.Replace("foo", "bar") // c.Id("foo") will still return a ID. The ID will be the same as c.Id("bar") func (c *Corpus) Replace(a, with string) error { - old, ok := c.ids[a] + old, ok := c.Ids[a] if !ok { return errors.Errorf("Cannot replace %q with %q. %q is not found", a, with, a) } - if _, ok := c.ids[with]; ok { + if _, ok := c.Ids[with]; ok { return errors.Errorf("Cannot replace %q with %q. %q exists in the corpus", a, with, with) } - c.words[old] = with - c.ids[with] = old + c.Words[old] = with + c.Ids[with] = old return nil } // ReplaceWord replaces the word associated with the given ID. The old reference remains. func (c *Corpus) ReplaceWord(id int, with string) error { - if id >= len(c.words) { + if id >= len(c.Words) { return errors.Errorf("Cannot replace word with ID %d. Out of bounds.", id) } - if _, ok := c.ids[with]; ok { + if _, ok := c.Ids[with]; ok { return errors.Errorf("Cannot replace word with ID %d with %q. %q exists in the corpus", id, with, with) } - c.words[id] = with - c.ids[with] = id + c.Words[id] = with + c.Ids[with] = id return nil } diff --git a/corpus_test.go b/corpus_test.go index 359d945..d4632c3 100644 --- a/corpus_test.go +++ b/corpus_test.go @@ -15,8 +15,8 @@ func TestCorpus(t *testing.T) { id := dict.Add("hello") assert.Equal(3, id) - assert.Equal([]string{"", "-UNKNOWN-", "-ROOT-", "hello"}, dict.words) - assert.Equal(map[string]int{"": 0, "-UNKNOWN-": 1, "-ROOT-": 2, "hello": 3}, dict.ids) + assert.Equal([]string{"", "-UNKNOWN-", "-ROOT-", "hello"}, dict.Words) + assert.Equal(map[string]int{"": 0, "-UNKNOWN-": 1, "-ROOT-": 2, "hello": 3}, dict.Ids) assert.Equal(4, dict.Size()) id2, ok := dict.Id("hello") @@ -50,16 +50,16 @@ func TestCorpus_Merge(t *testing.T) { dict := New() id := dict.Add("hello") - dict.frequencies[id] += 4 // freq for "hello" is 5 - dict.totalFreq += 4 + dict.Frequencies[id] += 4 // freq for "hello" is 5 + dict.TotalWordFreq += 4 other := New() id = other.Add("hello") - other.frequencies[id] += 2 // freq for "hello" is 3 - other.totalFreq += 2 + other.Frequencies[id] += 2 // freq for "hello" is 3 + other.TotalWordFreq += 2 id = other.Add("world") - other.frequencies[id] += 1 - other.totalFreq += 1 + other.Frequencies[id] += 1 + other.TotalWordFreq += 1 dict.Merge(other) diff --git a/functions.go b/functions.go index dc70215..87ebfb2 100644 --- a/functions.go +++ b/functions.go @@ -6,7 +6,7 @@ import ( "unicode/utf8" ) -// ViterbiSplit is a Viterbi algorithm for splitting words given a corpus +// ViterbiSplit is a Viterbi algorithm for splitting Words given a corpus func ViterbiSplit(input string, c *Corpus) []string { s := strings.ToLower(input) probabilities := []float64{1.0} @@ -22,7 +22,7 @@ func ViterbiSplit(input string, c *Corpus) []string { probs := make([]float64, 0) ls := make([]int, 0) - // m := maxInt(0, i-c.maxWordLength) + // m := maxInt(0, i-c.MaxWordLength_) for j, r := range runes { if r > i { @@ -32,7 +32,7 @@ func ViterbiSplit(input string, c *Corpus) []string { p, ok := c.WordProb(s[r : i+1]) if !ok { // http://stackoverflow.com/questions/195010/how-can-i-split-multiple-joined-words#comment48879458_481773 - p = (math.Log(float64(1)/float64(c.totalFreq)) - float64(c.maxWordLength) - float64(1)) * float64(i-r) // note it should be i-r not j-i as per the SO post + p = (math.Log(float64(1)/float64(c.TotalWordFreq)) - float64(c.MaxWordLength_) - float64(1)) * float64(i-r) // note it should be i-r not j-i as per the SO post } prob := probabilities[j] * p diff --git a/functions_test.go b/functions_test.go index c7812b7..252f6c3 100644 --- a/functions_test.go +++ b/functions_test.go @@ -24,8 +24,8 @@ func TestViterbiSplit(t *testing.T) { /* // FAILING TEST s2 = "curiouserandcuriouser" - words = ViterbiSplit(s2, dict) - assert.Equal([]string{"curiouser", "and", "curiouser"}, words) + Words = ViterbiSplit(s2, dict) + assert.Equal([]string{"curiouser", "and", "curiouser"}, Words) */ s3 := "thebestwaytoexplainitistodoit" diff --git a/io.go b/io.go index a2526a1..96ca0ff 100644 --- a/io.go +++ b/io.go @@ -28,11 +28,11 @@ func (s *sortutil) Swap(i, j int) { } } -// ToDictWithFreq returns a simple marshalable type. Conceptually it's a JSON object with the words as the keys. The values are a pair - ID and Freq. +// ToDictWithFreq returns a simple marshalable type. Conceptually it's a JSON object with the Words as the keys. The values are a pair - ID and Freq. func ToDictWithFreq(c *Corpus) map[string]struct{ ID, Freq int } { retVal := make(map[string]struct{ ID, Freq int }) - for i, w := range c.words { - retVal[w] = struct{ ID, Freq int }{i, c.frequencies[i]} + for i, w := range c.Words { + retVal[w] = struct{ ID, Freq int }{i, c.Frequencies[i]} } return retVal } @@ -40,7 +40,7 @@ func ToDictWithFreq(c *Corpus) map[string]struct{ ID, Freq int } { // ToDict returns a marshalable dict. It returns a copy of the ID mapping. func ToDict(c *Corpus) map[string]int { retVal := make(map[string]int) - for k, v := range c.ids { + for k, v := range c.Ids { retVal[k] = v } return retVal @@ -51,27 +51,27 @@ func (c *Corpus) GobEncode() ([]byte, error) { var buf bytes.Buffer encoder := gob.NewEncoder(&buf) - if err := encoder.Encode(c.words); err != nil { + if err := encoder.Encode(c.Words); err != nil { return nil, err } - if err := encoder.Encode(c.ids); err != nil { + if err := encoder.Encode(c.Ids); err != nil { return nil, err } - if err := encoder.Encode(c.frequencies); err != nil { + if err := encoder.Encode(c.Frequencies); err != nil { return nil, err } - if err := encoder.Encode(c.maxid); err != nil { + if err := encoder.Encode(c.MaxID); err != nil { return nil, err } - if err := encoder.Encode(c.totalFreq); err != nil { + if err := encoder.Encode(c.TotalWordFreq); err != nil { return nil, err } - if err := encoder.Encode(c.maxWordLength); err != nil { + if err := encoder.Encode(c.MaxWordLength_); err != nil { return nil, err } @@ -83,34 +83,34 @@ func (c *Corpus) GobDecode(buf []byte) error { b := bytes.NewBuffer(buf) decoder := gob.NewDecoder(b) - if err := decoder.Decode(&c.words); err != nil { + if err := decoder.Decode(&c.Words); err != nil { return err } - if err := decoder.Decode(&c.ids); err != nil { + if err := decoder.Decode(&c.Ids); err != nil { return err } - if err := decoder.Decode(&c.frequencies); err != nil { + if err := decoder.Decode(&c.Frequencies); err != nil { return err } - if err := decoder.Decode(&c.maxid); err != nil { + if err := decoder.Decode(&c.MaxID); err != nil { return err } - if err := decoder.Decode(&c.totalFreq); err != nil { + if err := decoder.Decode(&c.TotalWordFreq); err != nil { return err } - if err := decoder.Decode(&c.maxWordLength); err != nil { + if err := decoder.Decode(&c.MaxWordLength_); err != nil { return err } return nil } -// LoadOneGram loads a 1_gram.txt file, which is a tab separated file which lists the frequency counts of words. Example: +// LoadOneGram loads a 1_gram.txt file, which is a tab separated file which lists the frequency counts of Words. Example: // the 23135851162 // of 13151942776 // and 12997637966 @@ -135,13 +135,13 @@ func (c *Corpus) LoadOneGram(r io.Reader) error { } id := c.Add(word) - c.frequencies[id] = count - c.totalFreq-- - c.totalFreq += count + c.Frequencies[id] = count + c.TotalWordFreq-- + c.TotalWordFreq += count wc := len([]rune(word)) - if wc > c.maxWordLength { - c.maxWordLength = wc + if wc > c.MaxWordLength_ { + c.MaxWordLength_ = wc } } return nil diff --git a/io_test.go b/io_test.go index ce89f47..fd3a96c 100644 --- a/io_test.go +++ b/io_test.go @@ -52,12 +52,12 @@ func TestCorpusToDict(t *testing.T) { if err != nil { t.Fatal(err) } - assert.Equal(c.words, c2.words, "Expected words to be the same") - assert.Equal(c.ids, c2.ids, "Expected IDs to be the same") - assert.NotEqual(c.frequencies, c2.frequencies, "Expected frequencies to not be the same") - assert.Equal(c.maxid, c2.maxid, "Expected maxID to be the same") - assert.NotEqual(c.totalFreq, c2.totalFreq, "Expected totalFreq to be different.") - assert.Equal(c.maxWordLength, c2.maxWordLength, "Expected maxWordLength to be the same") + assert.Equal(c.Words, c2.Words, "Expected Words to be the same") + assert.Equal(c.Ids, c2.Ids, "Expected IDs to be the same") + assert.NotEqual(c.Frequencies, c2.Frequencies, "Expected Frequencies to not be the same") + assert.Equal(c.MaxID, c2.MaxID, "Expected maxID to be the same") + assert.NotEqual(c.TotalWordFreq, c2.TotalWordFreq, "Expected TotalWordFreq to be different.") + assert.Equal(c.MaxWordLength_, c2.MaxWordLength_, "Expected MaxWordLength_ to be the same") } func TestCorpusToDictWithFreq(t *testing.T) { @@ -86,7 +86,7 @@ func TestLoadOneGram(t *testing.T) { if !ok { t.Errorf("Expected \"for\" to be in corpus after loading one gram file") } - assert.Equal(int(c.maxid-1), id) + assert.Equal(int(c.MaxID-1), id) } func TestFromTextCorpus(t *testing.T) { @@ -116,8 +116,8 @@ func TestFromTextCorpus(t *testing.T) { // FOR DEBUG PURPOSES // g, err := os.OpenFile("testdata/tmp", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) // require.NoError(t, err) - // for i, w := range c.words { - // fmt.Fprintf(g, "%v %d\n", w, c.frequencies[i]) + // for i, w := range c.Words { + // fmt.Fprintf(g, "%v %d\n", w, c.Frequencies[i]) // } // g.Close() }