Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.text.similarity;

import java.util.Objects;

/**
 * Holds the outcome of intersecting two sets.
 *
 * <p>Three values are recorded: the size of set A, the size of set B and the size of
 * their intersection (<code>|A &#8745; B|</code>).</p>
 *
 * <p>Instances are immutable.</p>
 *
 * @since 1.7
 * @see <a href="https://en.wikipedia.org/wiki/Intersection_(set_theory)">Intersection</a>
 */
public class IntersectionResult {
    /** Number of elements in set A. */
    private final int sizeA;

    /** Number of elements in set B. */
    private final int sizeB;

    /** Number of elements shared by set A and set B. */
    private final int intersection;

    /**
     * Builds the result of intersecting two sets from the three sizes involved.
     *
     * <p>The arguments are validated in order: {@code sizeA}, then {@code sizeB},
     * then {@code intersection}; the first violation found is the one reported.</p>
     *
     * @param sizeA the size of set A ({@code |A|})
     * @param sizeB the size of set B ({@code |B|})
     * @param intersection the size of the intersection of A and B (<code>|A &#8745; B|</code>)
     * @throws IllegalArgumentException if the sizes are negative or the intersection is greater
     * than the minimum of the two set sizes
     */
    public IntersectionResult(final int sizeA, final int sizeB, final int intersection) {
        requireNonNegative(sizeA, "Set size |A| is not positive: ");
        requireNonNegative(sizeB, "Set size |B| is not positive: ");
        // The intersection can never hold more elements than the smaller of the two sets
        final boolean withinBounds = intersection >= 0 && intersection <= Math.min(sizeA, sizeB);
        if (!withinBounds) {
            throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection);
        }
        this.sizeA = sizeA;
        this.sizeB = sizeB;
        this.intersection = intersection;
    }

    /**
     * Rejects a negative size.
     *
     * @param size the size to check
     * @param messagePrefix the text placed before the offending value in the exception message
     * @throws IllegalArgumentException if {@code size} is below zero
     */
    private static void requireNonNegative(final int size, final String messagePrefix) {
        if (size < 0) {
            throw new IllegalArgumentException(messagePrefix + size);
        }
    }

    /**
     * Get the size of set A.
     *
     * @return |A|
     */
    public int getSizeA() {
        return sizeA;
    }

    /**
     * Get the size of set B.
     *
     * @return |B|
     */
    public int getSizeB() {
        return sizeB;
    }

    /**
     * Get the size of the intersection between set A and B.
     *
     * @return <code>|A &#8745; B|</code>
     */
    public int getIntersection() {
        return intersection;
    }

    /**
     * Two results are equal when they are of exactly the same class and all three
     * stored sizes match.
     *
     * @param o the object to compare with
     * @return {@code true} if {@code o} is an equal result
     */
    @Override
    public boolean equals(final Object o) {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        if (o.getClass() != getClass()) {
            return false;
        }
        final IntersectionResult other = (IntersectionResult) o;
        if (other.sizeA != sizeA) {
            return false;
        }
        if (other.sizeB != sizeB) {
            return false;
        }
        return other.intersection == intersection;
    }

    /**
     * Hash code derived from the three stored sizes.
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        return Objects.hash(sizeA, sizeB, intersection);
    }

    /**
     * Text form listing the size of A, the size of B and the intersection size.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        final String sizes = "Size A: " + sizeA + ", Size B: " + sizeB;
        return sizes + ", Intersection: " + intersection;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.text.similarity;

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Function;

/**
* Measures the intersection of two sets created from a pair of character sequences.
*
* <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage
* within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
* {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>
*
* @param <T> the type of the elements extracted from the character sequence
* @since 1.7
* @see Set
* @see HashMap
*/
public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> {
/**
 * The converter used to create the elements from the characters. It is applied
 * separately to the left and to the right input sequence in {@code apply}.
 */
private final Function<CharSequence, Collection<T>> converter;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we also need a DEFAULT_CHARACTER_CONVERTER in some class/interface? I needed one for characters, so just grabbed the one from the unit tests. I imagine other users with two CharSequence instances would probably use the default one too?

Something like

        public static final Function<CharSequence, Collection<Character>> DEFAULT_CHARACTER_CONVERTER = cs -> {
            final int length = cs.length();
            final Set<Character> set = new HashSet<>(length);
            for (int i = 0; i < length; i++) {
                set.add(cs.charAt(i));
            }
            return set;
        };

I think it is Commons CSV that provides some default parsers or readers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this code gets pulled into use in JaccardSimilarity and SorensenDice then I can add a package private CharSequenceUtils that will have, e.g.:

Set toCharacterSet(CharSequence);
Set toBigramSet(CharSequence);
List toCharacterList(CharSequence);
List toBigramList(CharSequence);

That should be enough to handle the current use cases.

The unit test for creating words of uppercase bigrams is another algorithm but with no name. I do not think we want to support it.


// The following is adapted from commons-collections for a Bag.
// A Bag is a collection that can store the count of the number
// of copies of each element.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it work if we use the Bag from Commons Collections too? If so, we can add it as a dependency as we did with Commons Lang.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

But once I got going I found the amount of code was minimal. This implementation is faster as it does not support fail-fast concurrent modification checking for threads and doesn't even know its own size (saving unneeded counter incrementation).

I'd favour not adding a dependency just for these 15 lines of code and using the faster implementation.

However if we do use Bag from collections it offers the possibility that the user can pass one in (via the converter function) and the algorithm detects it.


/**
 * Mutable holder for the number of times an element has been seen.
 */
private static class BagCount {
    /** The count. A newly created instance starts at 1. */
    int count;

    /**
     * Create a counter representing a single occurrence.
     */
    BagCount() {
        count = 1;
    }
}

/**
* A minimal implementation of a Bag that can store elements and a count.
*
* <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not
* even have to know its own size.</p>
*/
private class TinyBag {
/** The backing map from each distinct element to the mutable counter holding its count. */
private final Map<T, BagCount> map;

/**
 * Create an empty bag whose backing {@link HashMap} is allocated with the
 * requested initial capacity.
 *
 * @param initialCapacity the initial capacity
 */
TinyBag(final int initialCapacity) {
    this.map = new HashMap<>(initialCapacity);
}

/**
 * Records one more occurrence of an element.
 *
 * <p>An element that is already present has its counter incremented; an element seen
 * for the first time is stored with a fresh counter.</p>
 *
 * @param object the object to add
 */
void add(T object) {
    final BagCount existing = map.get(object);
    if (existing != null) {
        // Already known: bump the stored counter in place
        existing.count++;
        return;
    }
    // First sighting: a new counter already represents one occurrence
    map.put(object, new BagCount());
}

/**
 * Looks up how many times the given element has been added to this bag.
 *
 * @param object the object to search for
 * @return the number of occurrences of the object, zero if not found
 */
int getCount(final Object object) {
    final BagCount found = map.get(object);
    return found == null ? 0 : found.count;
}

/**
 * Returns a Set view of the mappings contained in this bag. Each entry pairs a
 * distinct element with the counter holding its number of occurrences.
 *
 * <p>The result comes directly from the backing map's {@code entrySet()}; no copy
 * is made.</p>
 *
 * @return the Set view
 */
Set<Entry<T, BagCount>> entrySet() {
return map.entrySet();
}

/**
 * Get the number of unique elements in the bag. This is the size of the backing
 * map, so repeated additions of the same element are counted once.
 *
 * @return the unique element size
 */
int uniqueElementSize() {
return map.size();
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Returning a parallelStream might be bad here I think. Instead we should either default to normal stream, or give the user a way to choose whether to use a normal or a parallel stream (example discussion about it with some good references).

I had an issue with a parallel stream in a library some months ago, and took a while to identify where the problem was (it involved returning gridded data from an ArcGIS server in a JAXB web service in a legacy app being ported to Java 8... not the best of the experiences troubleshooting that).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good info.

I was just playing with the API. I'd be happy with a standard for loop over the collection KeySet or EntrySet:

intersection = 0;
for (Entry<T, BagCount> entry : bagA.map.entrySet()) {
    final T element = entry.getKey();
    final int count = entry.getValue().count;
    intersection += Math.min(count, bagB.getCount(element));
}

I do not know about performance. It would have to be tested using JMH versus the stream. But the stream would be doing a lot more work pushing objects around, converting to int then summing them.

}

/**
* Create a new intersection similarity using the provided converter.
*
* <p>If the converter returns a {@link Set} then the intersection result will
* not include duplicates. Any other {@link Collection} is used to produce a result
* that will include duplicates in the intersect and union.
*
* @param converter the converter used to create the elements from the characters
* @throws IllegalArgumentException if the converter is null
*/
public IntersectionSimilarity(Function<CharSequence, Collection<T>> converter) {
if (converter == null) {
throw new IllegalArgumentException("Converter must not be null");
}
this.converter = converter;
}

/**
 * Computes the intersection of the element collections produced from two character
 * sequences.
 *
 * <p>Each sequence is passed through the converter. When both resulting collections
 * are a {@link Set}, every shared element is counted once. Otherwise both collections
 * are tallied as bags and every shared element contributes the smaller of its two
 * counts.</p>
 *
 * @param left first character sequence
 * @param right second character sequence
 * @return the intersection result
 * @throws IllegalArgumentException if either input sequence is {@code null}
 */
@Override
public IntersectionResult apply(final CharSequence left, final CharSequence right) {
    if (left == null || right == null) {
        throw new IllegalArgumentException("Input cannot be null");
    }

    // Turn each sequence into its collection of elements
    final Collection<T> elementsLeft = converter.apply(left);
    final Collection<T> elementsRight = converter.apply(right);
    final int sizeA = elementsLeft.size();
    final int sizeB = elementsRight.size();

    // An empty collection on either side cannot share anything
    final int smallerSize = Math.min(sizeA, sizeB);
    if (smallerSize == 0) {
        return new IntersectionResult(sizeA, sizeB, 0);
    }

    final boolean bothSets = elementsLeft instanceof Set && elementsRight instanceof Set;
    final int shared;
    if (!bothSets) {
        // At least one side may hold duplicates: tally both sides as bags
        final TinyBag bagLeft = toBag(elementsLeft);
        final TinyBag bagRight = toBag(elementsRight);
        // Walk whichever bag has fewer distinct elements
        if (bagLeft.uniqueElementSize() < bagRight.uniqueElementSize()) {
            shared = getIntersection(bagLeft, bagRight);
        } else {
            shared = getIntersection(bagRight, bagLeft);
        }
    } else {
        // Every element of a Set has a count of exactly one
        final Set<T> setLeft = (Set<T>) elementsLeft;
        final Set<T> setRight = (Set<T>) elementsRight;
        // Walk the smaller set and probe the larger one
        if (sizeA < sizeB) {
            shared = getIntersection(setLeft, setRight);
        } else {
            shared = getIntersection(setRight, setLeft);
        }
    }

    return new IntersectionResult(sizeA, sizeB, shared);
}

/**
 * Tallies a collection into a bag, so that each distinct element is stored once
 * together with the number of times it occurs in the collection.
 *
 * @param objects the objects
 * @return the bag
 */
private TinyBag toBag(Collection<T> objects) {
    // Use the collection size as the initial capacity of the backing map
    final int capacity = objects.size();
    final TinyBag counts = new TinyBag(capacity);
    for (final T element : objects) {
        counts.add(element);
    }
    return counts;
}

/**
 * Counts the elements of set A that are also contained in set B. Set A is the one
 * iterated; set B is only probed with {@code contains}.
 *
 * @param <T> the type of the elements in the set
 * @param setA the set A
 * @param setB the set B
 * @return the intersection
 */
private static <T> int getIntersection(Set<T> setA, Set<T> setB) {
    int shared = 0;
    for (final T candidate : setA) {
        shared += setB.contains(candidate) ? 1 : 0;
    }
    return shared;
}

/**
 * Computes the intersection between two bags. Every distinct element of bag A
 * contributes the smaller of its count in bag A and its count in bag B, which is
 * zero when bag B does not contain the element.
 *
 * @param bagA the bag A
 * @param bagB the bag B
 * @return the intersection
 */
private int getIntersection(TinyBag bagA, TinyBag bagB) {
    int shared = 0;
    for (final Entry<T, BagCount> tally : bagA.entrySet()) {
        final int countInA = tally.getValue().count;
        final int countInB = bagB.getCount(tally.getKey());
        // Only the occurrences present on both sides are shared
        shared += Math.min(countInA, countInB);
    }
    return shared;
}
}
Loading