-
Notifications
You must be signed in to change notification settings - Fork 274
TEXT-155: Add a generic OverlapSimilarity measure #109
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package org.apache.commons.text.similarity; | ||
|
|
||
| import java.util.Objects; | ||
|
|
||
/**
 * Represents the intersection result between two sets.
 *
 * <p>Stores the size of set A, the size of set B and the size of the
 * intersection of A and B (<code>|A &cap; B|</code>).</p>
 *
 * <p>All fields are final and assigned once in the constructor.</p>
 *
 * @since 1.7
 * @see <a href="https://en.wikipedia.org/wiki/Intersection_(set_theory)">Intersection</a>
 */
public class IntersectionResult {
    /**
     * The size of set A.
     */
    private final int sizeA;
    /**
     * The size of set B.
     */
    private final int sizeB;
    /**
     * The size of the intersection between set A and B.
     */
    private final int intersection;

    /**
     * Create the results for an intersection between two sets.
     *
     * <p>A size of zero is valid for either set; only negative sizes are rejected.</p>
     *
     * @param sizeA the size of set A ({@code |A|})
     * @param sizeB the size of set B ({@code |B|})
     * @param intersection the size of the intersection of A and B (<code>|A &cap; B|</code>)
     * @throws IllegalArgumentException if the sizes are negative or the intersection is negative
     * or greater than the minimum of the two set sizes
     */
    public IntersectionResult(final int sizeA, final int sizeB, final int intersection) {
        // Zero is an acceptable size, so the message must say "negative" rather than "not positive".
        if (sizeA < 0) {
            throw new IllegalArgumentException("Set size |A| is negative: " + sizeA);
        }
        if (sizeB < 0) {
            throw new IllegalArgumentException("Set size |B| is negative: " + sizeB);
        }
        // The intersection can never hold more elements than the smaller of the two sets.
        if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) {
            throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection);
        }
        this.sizeA = sizeA;
        this.sizeB = sizeB;
        this.intersection = intersection;
    }

    /**
     * Get the size of set A.
     *
     * @return |A|
     */
    public int getSizeA() {
        return sizeA;
    }

    /**
     * Get the size of set B.
     *
     * @return |B|
     */
    public int getSizeB() {
        return sizeB;
    }

    /**
     * Get the size of the intersection between set A and B.
     *
     * @return <code>|A &cap; B|</code>
     */
    public int getIntersection() {
        return intersection;
    }

    /**
     * Compares this result with another object. Two results are equal when they are of
     * exactly the same class and all three stored sizes match.
     *
     * @param o the object to compare with
     * @return {@code true} if {@code o} is an equal result
     */
    @Override
    public boolean equals(final Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        final IntersectionResult result = (IntersectionResult) o;
        return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection;
    }

    /**
     * Computes a hash code from the three stored sizes, consistent with {@link #equals(Object)}.
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        return Objects.hash(sizeA, sizeB, intersection);
    }

    /**
     * Returns a human readable summary listing both set sizes and the intersection size.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection;
    }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,237 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package org.apache.commons.text.similarity; | ||
|
|
||
| import java.util.Collection; | ||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
| import java.util.Map.Entry; | ||
| import java.util.Set; | ||
| import java.util.function.Function; | ||
|
|
||
| /** | ||
| * Measures the intersection of two sets created from a pair of character sequences. | ||
| * | ||
| * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage | ||
| * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements | ||
| * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p> | ||
| * | ||
| * @param <T> the type of the elements extracted from the character sequence | ||
| * @since 1.7 | ||
| * @see Set | ||
| * @see HashMap | ||
| */ | ||
| public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> { | ||
| /** The converter used to create the elements from the characters. */ | ||
| private final Function<CharSequence, Collection<T>> converter; | ||
|
|
||
| // The following is adapted from commons-collections for a Bag. | ||
| // A Bag is a collection that can store the count of the number | ||
| // of copies of each element. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it work if we use the Bag from Commons Collections too? If so, we can add it as a dependency as we did with Commons Lang.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. But once I got going I found the amount of code was minimal. This implementation is faster as it does not support fail-fast concurrent modification checking for threads and doesn't even know its own size (saving unneeded counter incrementation). I'd favour not adding a dependency just for these 15 lines of code and using the faster implementation. However if we do use |
||
|
|
||
| /** | ||
| * Mutable counter class for storing the count of elements. | ||
| */ | ||
| private static class BagCount { | ||
| /** The count. This is initialised to 1 upon construction. */ | ||
| int count = 1; | ||
| } | ||
|
|
||
| /** | ||
| * A minimal implementation of a Bag that can store elements and a count. | ||
| * | ||
| * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not | ||
| * even have to know its own size. | ||
| */ | ||
| private class TinyBag { | ||
| /** The backing map. */ | ||
| private final Map<T, BagCount> map; | ||
|
|
||
/**
 * Construct an empty bag whose backing hash map starts with the given capacity.
 *
 * @param initialCapacity the initial capacity handed to the backing map
 */
TinyBag(final int initialCapacity) {
    this.map = new HashMap<>(initialCapacity);
}
|
|
||
| /** | ||
| * Adds a new element to the bag, incrementing its count in the underlying map. | ||
| * | ||
| * @param object the object to add | ||
| */ | ||
| void add(T object) { | ||
| final BagCount mut = map.get(object); | ||
| if (mut == null) { | ||
| map.put(object, new BagCount()); | ||
| } else { | ||
| mut.count++; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns the number of occurrence of the given element in this bag by | ||
| * looking up its count in the underlying map. | ||
| * | ||
| * @param object the object to search for | ||
| * @return the number of occurrences of the object, zero if not found | ||
| */ | ||
| int getCount(final Object object) { | ||
| final BagCount count = map.get(object); | ||
| if (count != null) { | ||
| return count.count; | ||
| } | ||
| return 0; | ||
| } | ||
|
|
||
| /** | ||
| * Returns a Set view of the mappings contained in this bag. | ||
| * | ||
| * @return the Set view | ||
| */ | ||
| Set<Entry<T, BagCount>> entrySet() { | ||
| return map.entrySet(); | ||
| } | ||
|
|
||
| /** | ||
| * Get the number of unique elements in the bag. | ||
| * | ||
| * @return the unique element size | ||
| */ | ||
| int uniqueElementSize() { | ||
| return map.size(); | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Returning a I had an issue with a parallel stream in a library some months ago, and took a while to identify where the problem was (it involved returning gridded data from an ArcGIS server in a JAXB web service in a legacy app being ported to Java 8... not the best of the experiences troubleshooting that).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good info. I was just playing with the API. I'd be happy with a standard for loop over the collection. I do not know about performance. It would have to be tested using JMH versus the stream. But the stream would be doing a lot more work pushing objects around, converting to int then summing them. |
||
| } | ||
|
|
||
/**
 * Create a new intersection similarity that uses the given converter to turn
 * each character sequence into a collection of elements.
 *
 * <p>When the converter produces a {@link Set} for both inputs each shared element
 * is counted once. For any other {@link Collection} the elements are counted
 * with their multiplicity, so duplicates contribute to the result.</p>
 *
 * @param converter the converter used to create the elements from the characters
 * @throws IllegalArgumentException if the converter is null
 */
public IntersectionSimilarity(final Function<CharSequence, Collection<T>> converter) {
    if (converter == null) {
        throw new IllegalArgumentException("Converter must not be null");
    }
    this.converter = converter;
}
|
|
||
| /** | ||
| * Calculates the intersection of two character sequences passed as input. | ||
| * | ||
| * @param left first character sequence | ||
| * @param right second character sequence | ||
| * @return the intersection result | ||
| * @throws IllegalArgumentException if either input sequence is {@code null} | ||
| */ | ||
| @Override | ||
| public IntersectionResult apply(final CharSequence left, final CharSequence right) { | ||
| if (left == null || right == null) { | ||
| throw new IllegalArgumentException("Input cannot be null"); | ||
| } | ||
|
|
||
| // Create the elements from the sequences | ||
| final Collection<T> objectsA = converter.apply(left); | ||
| final Collection<T> objectsB = converter.apply(right); | ||
| final int sizeA = objectsA.size(); | ||
| final int sizeB = objectsB.size(); | ||
|
|
||
| // Short-cut if either collection is empty | ||
| if (Math.min(sizeA, sizeB) == 0) { | ||
| // No intersection | ||
| return new IntersectionResult(sizeA, sizeB, 0); | ||
| } | ||
|
|
||
| // Intersection = count the number of shared elements | ||
| int intersection; | ||
| if (objectsA instanceof Set && objectsB instanceof Set) { | ||
| // If a Set then the elements will only have a count of 1. | ||
| // Iterate over the smaller set. | ||
| intersection = (sizeA < sizeB) | ||
| ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB) | ||
| : getIntersection((Set<T>) objectsB, (Set<T>) objectsA); | ||
| } else { | ||
| // Create a bag for each collection | ||
| final TinyBag bagA = toBag(objectsA); | ||
| final TinyBag bagB = toBag(objectsB); | ||
| // Iterate over the smaller number of unique elements | ||
| intersection = (bagA.uniqueElementSize() < bagB.uniqueElementSize()) | ||
| ? getIntersection(bagA, bagB) | ||
| : getIntersection(bagB, bagA); | ||
| } | ||
|
|
||
| return new IntersectionResult(sizeA, sizeB, intersection); | ||
| } | ||
|
|
||
| /** | ||
| * Convert the collection to a bag. The bag will contain the count of each element | ||
| * in the collection. | ||
| * | ||
| * @param objects the objects | ||
| * @return the bag | ||
| */ | ||
| private TinyBag toBag(Collection<T> objects) { | ||
| final TinyBag bag = new TinyBag(objects.size()); | ||
| for (T t : objects) { | ||
| bag.add(t); | ||
| } | ||
| return bag; | ||
| } | ||
|
|
||
| /** | ||
| * Compute the intersection between two sets. This is the count of all the elements | ||
| * that are within both sets. | ||
| * | ||
| * @param <T> the type of the elements in the set | ||
| * @param setA the set A | ||
| * @param setB the set B | ||
| * @return the intersection | ||
| */ | ||
| private static <T> int getIntersection(Set<T> setA, Set<T> setB) { | ||
| int intersection = 0; | ||
| for (T element : setA) { | ||
| if (setB.contains(element)) { | ||
| intersection++; | ||
| } | ||
| } | ||
| return intersection; | ||
| } | ||
|
|
||
| /** | ||
| * Compute the intersection between two bags. This is the sum of the minimum | ||
| * count of each element that is within both sets. | ||
| * | ||
| * @param bagA the bag A | ||
| * @param bagB the bag B | ||
| * @return the intersection | ||
| */ | ||
| private int getIntersection(TinyBag bagA, TinyBag bagB) { | ||
| int intersection = 0; | ||
| for (Entry<T, BagCount> entry : bagA.entrySet()) { | ||
| final T element = entry.getKey(); | ||
| final int count = entry.getValue().count; | ||
| // The intersection of this entry in both bags is the minimum count | ||
| intersection += Math.min(count, bagB.getCount(element)); | ||
| } | ||
| return intersection; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we also need a
DEFAULT_CHARACTER_CONVERTER in some class/interface? I needed one for characters, so just grabbed the one from the unit tests. I imagine other users with two CharacterSequence would probably use the default one too? Something like
I think it is Commons CSV that provides some default parsers or readers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this code gets pulled into use in JaccardSimilarity and SorensenDice then I can add a package private
CharSequenceUtils that will have, e.g.: Set toCharacterSet(CharSequence);
Set toBigramSet(CharSequence);
List toCharacterList(CharSequence);
List toBigramList(CharSequence);
That should be enough to handle the current use cases.
The unit test for creating words of uppercase bigrams is another algorithm but with no name. I do not think we want to support it.