Skip to content

Commit aedd3ed

Browse files
committed
add n-gram jaccard index similarity
1 parent 2e0c5be commit aedd3ed

7 files changed

Lines changed: 292 additions & 38 deletions

File tree

README.md

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,45 +19,74 @@ composer require webd/language
1919
## Usage
2020

2121
```php
22+
// ------------ n-gram string similarity
23+
24+
use webd\language\StringSimilarity\DiceCoefficient;
25+
use webd\language\StringSimilarity\JaccardSimilarity;
26+
27+
28+
$dice = new DiceCoefficient(2);
29+
$jaccard = new JaccardSimilarity(2);
30+
31+
$a = "context";
32+
$b = "contact";
33+
34+
// 0.5
35+
echo $dice->similarity($a, $b) . PHP_EOL;
36+
37+
// context : ["co", "on", "nt", "te", "ex", "xt"]
38+
// contact : ["co", "on", "nt", "ta", "ac", "ct"]
39+
// jaccard similarity : 3 / 9
40+
// 0.33333
41+
echo $jaccard->similarity($a, $b) . PHP_EOL;
42+
43+
// ------------ string distance
44+
2245
use webd\language\StringDistance;
2346

2447
$string1 = "You won 10000$";
2548
$string2 = "You won 15500$";
2649

2750
// 2
28-
echo "Edit distance : " . StringDistance::editDistance($string1, $string2) . "\n";
51+
echo "Edit distance : " . StringDistance::editDistance($string1, $string2) . PHP_EOL;
2952

3053
// 2
31-
echo "Levenshtein : " . StringDistance::levenshtein($string1, $string2) . "\n";
54+
echo "Levenshtein : " . StringDistance::levenshtein($string1, $string2) . PHP_EOL;
55+
56+
57+
$lcs = new \webd\language\LCS($string1, $string2);
58+
// You won 100$
59+
echo $lcs->value() . PHP_EOL;
60+
61+
// 12
62+
echo $lcs->length() . PHP_EOL;
63+
64+
// 4
65+
echo $lcs->distance() . PHP_EOL;
66+
67+
// -------------- jaro-winkler string similarity
3268

3369
// 0.96428571428571
34-
echo "Jaro-Winkler : " . StringDistance::jaroWinkler($string1, $string2) . "\n";
70+
echo "Jaro-Winkler : " . StringDistance::jaroWinkler($string1, $string2) . PHP_EOL;
3571

3672
// 0.98809523809524
37-
echo "Jaro-Winkler (prefix scale = 0.2) : " . StringDistance::jaroWinkler($string1, $string2, 0.2) . "\n";
73+
echo "Jaro-Winkler (prefix scale = 0.2) : " . StringDistance::jaroWinkler($string1, $string2, 0.2) . PHP_EOL;
74+
75+
// -------------- stemming
3876

3977
use webd\language\PorterStemmer;
78+
4079
// analyz
41-
echo "analyzing => " . PorterStemmer::stem("analyzing") . "\n";
80+
echo "analyzing => " . PorterStemmer::stem("analyzing") . PHP_EOL;
4281

4382
// abandon
44-
echo "abandoned => " . PorterStemmer::stem("abandoned") . "\n";
83+
echo "abandoned => " . PorterStemmer::stem("abandoned") . PHP_EOL;
4584

4685
// inclin
47-
echo "inclination => " . PorterStemmer::stem("inclination") . "\n";
48-
49-
$lcs = new \webd\language\LCS($string1, $string2);
50-
// You won 100$
51-
echo $lcs->value() . "\n";
52-
53-
// 12
54-
echo $lcs->length() . "\n";
55-
56-
// 4
57-
echo $lcs->distance() . "\n";
86+
echo "inclination => " . PorterStemmer::stem("inclination") . PHP_EOL;
5887

59-
// SpamSum, aka ssdeep, aka Context-Triggered Piecewize Hashing (CTPH):
88+
// ------------- SpamSum, aka ssdeep, aka Context-Triggered Piecewise Hashing (CTPH)
6089
$s = new \webd\language\SpamSum;
6190
// 192:x+cMdRiWqk2YODjCoG4OU88/ffcQ+lsCYDIlp6+TF244htoJFUjw:krovCLA9byp6+52jhtnjw
62-
echo $s->HashString(file_get_contents(__DIR__ . "/SpamSum.php")) . "\n";
91+
echo $s->HashString(file_get_contents(__DIR__ . "/SpamSum.php")) . PHP_EOL;
6392
```
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?php
2+
3+
namespace webd\language\StringSimilarity;
4+
5+
/**
 * Dice coefficient between two strings, computed over character n-grams.
 *
 * Both strings are split into overlapping n-grams by NGramTrait::ngrams()
 * (which lower-cases the text first) and compared as multisets, so an n-gram
 * that occurs twice in both strings counts twice:
 *
 *     similarity = 2 * |A intersect B| / (|A| + |B|)
 *
 * Example with n = 2: "context" and "contact" share "co", "on" and "nt",
 * each string has 6 bigrams, hence 2 * 3 / 12 = 0.5.
 *
 * @author tibo
 */
class DiceCoefficient implements StringSimilarity
{
    use NGramTrait;

    /** Size of the n-grams; guaranteed to be >= 1 by the constructor. */
    private int $n;

    /**
     * @param int $n size of the n-grams (2 = bigrams, 3 = trigrams, ...)
     *
     * @throws \InvalidArgumentException if $n is lower than 1
     */
    public function __construct(int $n = 2)
    {
        // A size of 0 or less yields empty or truncated "grams" and a
        // meaningless score, so reject it up front.
        if ($n < 1) {
            throw new \InvalidArgumentException(
                'n-gram size must be at least 1, got ' . $n
            );
        }

        $this->n = $n;
    }

    /**
     * Compute the Dice coefficient of $a and $b.
     *
     * @param string $a first string
     * @param string $b second string
     *
     * @return float 1.0 when the strings are equal ignoring case, 0.0 when
     *               either string is too short to produce an n-gram or when
     *               no n-gram is shared, a value in between otherwise
     */
    public function similarity(string $a, string $b): float
    {
        // The n-grams are built from lower-cased text, so equality must be
        // tested on the same folded form. Otherwise "A" vs "a" (too short for
        // a bigram) would score 0.0 while "AB" vs "ab" scores 1.0.
        if ($a === $b || mb_strtolower($a, 'UTF-8') === mb_strtolower($b, 'UTF-8')) {
            return 1.0;
        }

        $ngramsA = $this->ngrams($a, $this->n);
        $ngramsB = $this->ngrams($b, $this->n);

        // Also protects the division below against a zero denominator.
        if ($ngramsA === [] || $ngramsB === []) {
            return 0.0;
        }

        $countsA = array_count_values($ngramsA);
        $countsB = array_count_values($ngramsB);

        // Multiset intersection: a shared n-gram contributes the smaller of
        // its two occurrence counts.
        $shared = 0;
        foreach (array_intersect_key($countsA, $countsB) as $gram => $countA) {
            $shared += min($countA, $countsB[$gram]);
        }

        return (2 * $shared) / (count($ngramsA) + count($ngramsB));
    }
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<?php
2+
3+
namespace webd\language\StringSimilarity;
4+
5+
/**
 * Jaccard index between two strings, computed over character n-grams.
 *
 * Each string is turned into the SET of its distinct n-grams (duplicates are
 * dropped, and NGramTrait::ngrams() lower-cases the text first):
 *
 *     similarity = |A intersect B| / |A union B|
 *
 * Example with n = 2: "context" and "contact" share 3 bigrams out of 9
 * distinct ones, hence 3 / 9 = 0.3333...
 *
 * @author tibo
 */
class JaccardSimilarity implements StringSimilarity
{
    use NGramTrait;

    /** Size of the n-grams; guaranteed to be >= 1 by the constructor. */
    private int $n;

    /**
     * @param int $n size of the n-grams (2 = bigrams, 3 = trigrams, ...)
     *
     * @throws \InvalidArgumentException if $n is lower than 1
     */
    public function __construct(int $n = 2)
    {
        // With n = 0 every string collapses to the single gram "" and any two
        // strings would score 1.0; negative sizes are just as meaningless.
        if ($n < 1) {
            throw new \InvalidArgumentException(
                'n-gram size must be at least 1, got ' . $n
            );
        }

        $this->n = $n;
    }

    /**
     * Compute the Jaccard index of $a and $b.
     *
     * @param string $a first string
     * @param string $b second string
     *
     * @return float share of distinct n-grams common to both strings; 0.0
     *               when neither string produces an n-gram (for instance two
     *               empty strings)
     */
    public function similarity(string $a, string $b): float
    {
        $gramsA = array_unique($this->ngrams($a, $this->n));
        $gramsB = array_unique($this->ngrams($b, $this->n));

        $unionSize = count(array_unique(array_merge($gramsA, $gramsB)));

        // Both strings were shorter than n: nothing to compare, and the
        // division below would be by zero.
        if ($unionSize === 0) {
            return 0.0;
        }

        return count(array_intersect($gramsA, $gramsB)) / $unionSize;
    }
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<?php
2+
3+
namespace webd\language\StringSimilarity;
4+
5+
/**
 * Shared helper that splits a string into overlapping character n-grams.
 *
 * @author tibo
 */
trait NGramTrait
{
    /**
     * Split $text into its overlapping n-grams, in order of appearance.
     *
     * The text is lower-cased first and handled as UTF-8, so grams are made of
     * characters, not bytes. Duplicates are kept: "aaa" with n = 2 gives
     * ["aa", "aa"]. A text shorter than $n gives an empty array.
     *
     * @param string $text text to split
     * @param int    $n    number of characters per gram, at least 1
     *
     * @return string[] list of n-grams
     *
     * @throws \InvalidArgumentException if $n is lower than 1
     */
    protected function ngrams(string $text, int $n): array
    {
        // n = 0 would return one empty string per position and a negative n
        // makes mb_substr() trim from the end of the text; neither is an n-gram.
        if ($n < 1) {
            throw new \InvalidArgumentException(
                'n-gram size must be at least 1, got ' . $n
            );
        }

        $text = mb_strtolower($text, 'UTF-8');

        // Index of the last position where a full n-gram still fits;
        // negative when the text is shorter than $n, so the loop is skipped.
        $last = mb_strlen($text, 'UTF-8') - $n;

        $grams = [];
        for ($i = 0; $i <= $last; $i++) {
            $grams[] = mb_substr($text, $i, $n, 'UTF-8');
        }

        return $grams;
    }
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<?php
2+
3+
namespace webd\language\StringSimilarity;
4+
5+
/**
 * Contract shared by the string similarity measures of this namespace.
 *
 * @author tibo
 */
interface StringSimilarity
{
    /**
     * Score how alike two strings are.
     *
     * The implementations in this namespace (DiceCoefficient and
     * JaccardSimilarity) return 0.0 when the strings have no n-gram in
     * common and 1.0 when their n-grams fully match.
     *
     * @param string $a first string
     * @param string $b second string
     *
     * @return float similarity score
     */
    public function similarity(string $a, string $b): float;
}

src/example.php

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,45 +2,73 @@
22

33
require __DIR__ . "/../vendor/autoload.php";
44

5+
// ------------ n-gram string similarity
6+
7+
use webd\language\StringSimilarity\DiceCoefficient;
8+
use webd\language\StringSimilarity\JaccardSimilarity;
9+
10+
11+
$dice = new DiceCoefficient(2);
12+
$jaccard = new JaccardSimilarity(2);
13+
14+
$a = "context";
15+
$b = "contact";
16+
17+
// 0.5
18+
echo $dice->similarity($a, $b) . PHP_EOL;
19+
20+
// context : ["co", "on", "nt", "te", "ex", "xt"]
21+
// contact : ["co", "on", "nt", "ta", "ac", "ct"]
22+
// jaccard similarity : 3 / 9
23+
// 0.33333
24+
echo $jaccard->similarity($a, $b) . PHP_EOL;
25+
26+
// ------------ string distance
27+
528
use webd\language\StringDistance;
629

730
$string1 = "You won 10000$";
831
$string2 = "You won 15500$";
932

1033
// 2
11-
echo "Edit distance : " . StringDistance::editDistance($string1, $string2) . "\n";
34+
echo "Edit distance : " . StringDistance::editDistance($string1, $string2) . PHP_EOL;
1235

1336
// 2
14-
echo "Levenshtein : " . StringDistance::levenshtein($string1, $string2) . "\n";
37+
echo "Levenshtein : " . StringDistance::levenshtein($string1, $string2) . PHP_EOL;
38+
39+
40+
$lcs = new \webd\language\LCS($string1, $string2);
41+
// You won 100$
42+
echo $lcs->value() . PHP_EOL;
43+
44+
// 12
45+
echo $lcs->length() . PHP_EOL;
46+
47+
// 4
48+
echo $lcs->distance() . PHP_EOL;
49+
50+
// -------------- jaro-winkler string similarity
1551

1652
// 0.96428571428571
17-
echo "Jaro-Winkler : " . StringDistance::jaroWinkler($string1, $string2) . "\n";
53+
echo "Jaro-Winkler : " . StringDistance::jaroWinkler($string1, $string2) . PHP_EOL;
1854

1955
// 0.98809523809524
20-
echo "Jaro-Winkler (prefix scale = 0.2) : " . StringDistance::jaroWinkler($string1, $string2, 0.2) . "\n";
56+
echo "Jaro-Winkler (prefix scale = 0.2) : " . StringDistance::jaroWinkler($string1, $string2, 0.2) . PHP_EOL;
57+
58+
// -------------- stemming
2159

2260
use webd\language\PorterStemmer;
2361

2462
// analyz
25-
echo "analyzing => " . PorterStemmer::stem("analyzing") . "\n";
63+
echo "analyzing => " . PorterStemmer::stem("analyzing") . PHP_EOL;
2664

2765
// abandon
28-
echo "abandoned => " . PorterStemmer::stem("abandoned") . "\n";
66+
echo "abandoned => " . PorterStemmer::stem("abandoned") . PHP_EOL;
2967

3068
// inclin
31-
echo "inclination => " . PorterStemmer::stem("inclination") . "\n";
32-
33-
$lcs = new \webd\language\LCS($string1, $string2);
34-
// You won 100$
35-
echo $lcs->value() . "\n";
36-
37-
// 12
38-
echo $lcs->length() . "\n";
39-
40-
// 4
41-
echo $lcs->distance() . "\n";
69+
echo "inclination => " . PorterStemmer::stem("inclination") . PHP_EOL;
4270

43-
// SpamSum, aka ssdeep, aka Context-Triggered Piecewize Hashing (CTPH):
71+
// ------------- SpamSum, aka ssdeep, aka Context-Triggered Piecewise Hashing (CTPH)
4472
$s = new \webd\language\SpamSum;
4573
// 192:x+cMdRiWqk2YODjCoG4OU88/ffcQ+lsCYDIlp6+TF244htoJFUjw:krovCLA9byp6+52jhtnjw
46-
echo $s->HashString(file_get_contents(__DIR__ . "/SpamSum.php")) . "\n";
74+
echo $s->HashString(file_get_contents(__DIR__ . "/SpamSum.php")) . PHP_EOL;
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
<?php
2+
3+
namespace webd\language\StringSimilarity;
4+
5+
use PHPUnit\Framework\TestCase;
6+
7+
/**
 * Unit tests for the n-gram based Jaccard similarity.
 */
class JaccardSimilarityTest extends TestCase
{
    /**
     * A string compared with itself shares all of its n-grams.
     *
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilarityIdentical(): void
    {
        $sim = new JaccardSimilarity();
        $a = "hello world";
        $this->assertSame(1.0, $sim->similarity($a, $a));
    }

    /**
     * Strings without any common bigram score exactly zero.
     *
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilarityCompletelyDifferent(): void
    {
        $sim = new JaccardSimilarity();
        $a = "abcdef";
        $b = "ghijkl";
        $this->assertSame(0.0, $sim->similarity($a, $b));
    }

    /**
     * Two empty strings produce no n-gram at all: the union is empty and the
     * similarity is defined as 0.0 instead of dividing by zero.
     *
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilarityEmptyStrings(): void
    {
        $sim = new JaccardSimilarity();
        $this->assertSame(0.0, $sim->similarity("", ""));
    }

    /**
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilarityCustomN(): void
    {
        $sim = new JaccardSimilarity(3);
        $a = "abcde"; // 3-grams: abc, bcd, cde
        $b = "cdefg"; // 3-grams: cde, def, efg
        // intersection: cde = 1; union: abc, bcd, cde, def, efg = 5
        $expected = 1 / 5;
        $this->assertEqualsWithDelta($expected, $sim->similarity($a, $b), 1e-9);
    }

    /**
     * Swapping the arguments must not change the score.
     *
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilaritySymmetry(): void
    {
        $sim = new JaccardSimilarity();
        $a = "test string";
        $b = "string test";
        $this->assertEqualsWithDelta(
            $sim->similarity($a, $b),
            $sim->similarity($b, $a),
            1e-9
        );
    }

    /**
     * With n = 1 the n-grams are single characters and repetitions are
     * ignored, because the Jaccard index works on sets.
     *
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilarityWithN1(): void
    {
        $sim = new JaccardSimilarity(1);
        $a = "aabb"; // chars: a,b
        $b = "abbb"; // chars: a,b
        // intersection: a,b = 2; union: a,b = 2 => 1.0
        $this->assertSame(1.0, $sim->similarity($a, $b));
    }

    /**
     * N-grams are extracted from lower-cased text, so letter case must not
     * influence the score.
     *
     * @covers webd\language\StringSimilarity\JaccardSimilarity::similarity
     */
    public function testSimilarityIsCaseInsensitive(): void
    {
        $sim = new JaccardSimilarity();
        $this->assertSame(1.0, $sim->similarity("Hello World", "hELLO wORLD"));
    }
}

0 commit comments

Comments
 (0)