Skip to content

Commit 844b762

Browse files
author
Nito
committed
cleanText is now a method
1 parent 9c5448a commit 844b762

File tree

4 files changed

+31
-8
lines changed

4 files changed

+31
-8
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ var_dump($eld->detect('Hola, cómo te llamas?'));
4040
print $eld->detect('Hola, cómo te llamas?')->language;
4141
// 'es'
4242

43-
// cleanText = true: Removes Urls, .com domains, emails, alphanumerical & numbers; from input text
44-
$eld->cleanText = true; // Default is false
43+
// cleanText(true): Removes URLs, .com domains, emails, alphanumerics & numbers from the input text
44+
$eld->cleanText(true); // Default is false
4545
```
4646

4747
- To reduce the languages to be detected, there are 3 different options, they only need to be executed once. (Check available [languages](#languages) below)

demo.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
// object( language => null|string, scores => null|array, isReliable() => bool )
2020
var_dump($result->language);
2121

22-
// cleanText = true: Removes Urls, .com domains, emails, alphanumerical & numbers
23-
$eld->cleanText = true; // Default is false
22+
// cleanText(true): Removes URLs, .com domains, emails, alphanumerics & numbers
23+
$eld->cleanText(true); // Default is false
2424

2525
/*
2626
To reduce the languages to be detected, there are 3 different options, they only need to be executed once.

src/LanguageDetector.php

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
*/
1717
class LanguageDetector extends LanguageData
1818
{
19-
public bool $cleanText = false;
19+
protected bool $doCleanText = false;
2020
private array $wordStart;
2121

2222
public function __construct(?string $ngramsFile = null)
@@ -32,10 +32,11 @@ public function __construct(?string $ngramsFile = null)
3232
*/
3333
public function detect(string $text): LanguageResult
3434
{
35-
if ($this->cleanText) {
35+
if ($this->doCleanText) {
3636
// Removes Urls, emails, alphanumerical & numbers
3737
$text = $this->getCleanText($text);
3838
}
39+
3940
$text = $this->normalizeText($text);
4041
$textNgrams = $this->getByteNgrams($text);
4142
$numNgrams = count($textNgrams);
@@ -52,6 +53,11 @@ public function detect(string $text): LanguageResult
5253
return new LanguageResult();
5354
}
5455

56+
public function cleanText(bool $bool): void
57+
{
58+
$this->doCleanText = $bool; // Already cast in the argument
59+
}
60+
5561
/**
5662
* Removes parts of a string, that may be considered as "noise" for language detection
5763
*/
@@ -169,4 +175,5 @@ protected function calculateScores(array $textNgrams, int $numNgrams): array
169175

170176
return $results;
171177
}
178+
172179
}

tests/tests.php

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
}
6262
});
6363

64-
$tests->addTest('Clean text', function () {
64+
$tests->addTest('getCleanText function', function () {
6565
$eld = new Nitotm\Eld\LanguageDetector();
6666

6767
$text = "https://www.google.com/\n" .
@@ -76,6 +76,22 @@
7676
}
7777
});
7878

79+
$tests->addTest('Clean text option', function () {
80+
$eld = new Nitotm\Eld\LanguageDetector();
81+
$eld->cleanText(true);
82+
83+
$text = "https://www.google.com/\n" .
84+
"mail@gmail.com\n" .
85+
"google.com/search?q=search&source=hp\n" .
86+
"12345 A12345\n";
87+
88+
$result = $eld->detect($text);
89+
90+
if ($result->language !== NULL) {
91+
throw new Exception("Expected: NULL, but got " . json_encode($result));
92+
}
93+
});
94+
7995
$tests->addTest('Check minimum confidence', function () {
8096
$eld = new Nitotm\Eld\LanguageDetector('ngramsM60.php');
8197

@@ -204,4 +220,4 @@
204220
}
205221
});
206222

207-
$tests->run();
223+
$tests->run();

0 commit comments

Comments
 (0)