Skip to content

Commit d56e6d4

Browse files
committed
Add a new HTMLEntry
1 parent ee96580 commit d56e6d4

File tree

10 files changed

+332
-18
lines changed

10 files changed

+332
-18
lines changed

src/adapter/etl-adapter-parquet/tests/Flow/ETL/Adapter/Parquet/Tests/Unit/RowsNormalizerTest.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ enum_entry,
1212
enum_schema,
1313
float_entry,
1414
float_schema,
15+
html_entry,
16+
html_schema,
1517
int_entry,
1618
int_schema,
1719
json_entry,
@@ -72,6 +74,7 @@ public function test_normalization_nullable_entries() : void
7274
),
7375
enum_entry('enum', null),
7476
xml_entry('xml', null),
77+
html_entry('html', null),
7578
)
7679
);
7780
$schema = schema(
@@ -101,6 +104,7 @@ enum_entry('enum', null),
101104
),
102105
enum_schema('enum', BackedStringEnum::class, true),
103106
xml_schema('xml', true),
107+
html_schema('html', true),
104108
);
105109

106110
self::assertEquals(
@@ -119,6 +123,7 @@ enum_schema('enum', BackedStringEnum::class, true),
119123
'struct' => null,
120124
'enum' => null,
121125
'xml' => null,
126+
'html' => null,
122127
],
123128
],
124129
(new RowsNormalizer())->normalize($rows, $schema)

src/core/etl/src/Flow/ETL/DSL/functions.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@
165165
use Flow\ETL\Retry\RetryStrategy\{AnyThrowable, OnExceptionTypes};
166166
use Flow\ETL\Row\{Entries, EntryFactory, SortOrder};
167167
use Flow\ETL\Row\Entry\{BooleanEntry, DateEntry, DateTimeEntry, EnumEntry, FloatEntry, IntegerEntry, JsonEntry, ListEntry, MapEntry, StringEntry, StructureEntry, TimeEntry, UuidEntry, XMLElementEntry, XMLEntry};
168+
use Flow\ETL\Row\Entry\HTMLEntry;
168169
use Flow\ETL\Row\{Entry, EntryReference, Reference, References};
169170
use Flow\ETL\Row\Formatter\ASCIISchemaFormatter;
170171
use Flow\ETL\Schema\{Definition, Formatter\PHPFormatter\TypeFormatter, Formatter\PHPFormatter\ValueFormatter};
@@ -627,6 +628,15 @@ function xml_element_entry(string $name, \DOMElement|string|null $value, ?Metada
627628
return new XMLElementEntry($name, $value, $metadata);
628629
}
629630

631+
/**
 * DSL factory for an HTML row entry.
 *
 * The name, value and optional metadata are passed straight to the HTMLEntry constructor.
 *
 * @return Entry<?HTMLDocument>
 */
#[DocumentationDSL(module: Module::CORE, type: DSLType::ENTRY)]
function html_entry(string $name, HTMLDocument|string|null $value, ?Metadata $metadata = null) : Entry
{
    return new HTMLEntry(
        name: $name,
        value: $value,
        metadata: $metadata,
    );
}
639+
630640
/**
631641
* @param Entry<mixed> ...$entries
632642
*/
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Row\Entry;
6+
7+
use function Flow\Types\DSL\{type_equals, type_html, type_optional};
8+
use Flow\ETL\Row\{Entry, Reference};
9+
use Flow\ETL\Schema\{Definition, Metadata};
10+
use Flow\Types\Type;
11+
use Flow\Types\Value\HTMLDocument;
12+
13+
/**
 * Row entry that holds an optional HTML document.
 *
 * A string passed to the constructor is converted with HTMLDocument::fromString();
 * a null value is kept as null and makes the entry definition nullable.
 *
 * @implements Entry<?HTMLDocument>
 */
final class HTMLEntry implements Entry
{
    use EntryRef;

    private Metadata $metadata;

    /**
     * @var Type<HTMLDocument>
     */
    private readonly Type $type;

    private ?HTMLDocument $value;

    /**
     * @param string $name entry name
     * @param null|HTMLDocument|string $value document, raw HTML string (parsed via HTMLDocument::fromString()) or null
     * @param null|Metadata $metadata entry metadata; Metadata::empty() is used when omitted
     */
    public function __construct(
        private readonly string $name,
        HTMLDocument|string|null $value,
        ?Metadata $metadata = null,
    ) {
        $this->value = \is_string($value) ? HTMLDocument::fromString($value) : $value;
        $this->metadata = $metadata ?? Metadata::empty();
        $this->type = type_html();
    }

    public function __toString() : string
    {
        return $this->toString();
    }

    /**
     * Builds the schema definition for this entry; it is nullable only when the value is null.
     */
    public function definition() : Definition
    {
        return new Definition($this->name, $this->type, null === $this->value, $this->metadata);
    }

    /**
     * Returns a copy of this entry holding a clone of the document (or null) and the same metadata.
     */
    public function duplicate() : self
    {
        return new self($this->name, null === $this->value ? null : clone $this->value, $this->metadata);
    }

    /**
     * Checks whether this entry is named like the given reference or string.
     */
    public function is(Reference|string $name) : bool
    {
        if ($name instanceof Reference) {
            return $this->name === $name->name();
        }

        return $this->name === $name;
    }

    /**
     * Two entries are equal when the other one is also an HTMLEntry with the same name,
     * an equal type and an identical string representation of the document (two nulls are equal).
     */
    public function isEqual(Entry $entry) : bool
    {
        if (!$entry instanceof self || !$this->is($entry->name())) {
            return false;
        }

        if (!type_equals($this->type, $entry->type)) {
            return false;
        }

        return $entry->value()?->toString() === $this->value?->toString();
    }

    /**
     * Returns a new entry with the mapper result as its value.
     *
     * Metadata is carried over so that mapping does not silently drop it,
     * in line with duplicate() and withValue().
     */
    public function map(callable $mapper) : self
    {
        return new self($this->name, $mapper($this->value), $this->metadata);
    }

    public function name() : string
    {
        return $this->name;
    }

    /**
     * Returns a new entry under the given name with the same value.
     *
     * Metadata is carried over so that renaming does not silently drop it,
     * in line with duplicate() and withValue().
     */
    public function rename(string $name) : self
    {
        return new self($name, $this->value, $this->metadata);
    }

    /**
     * String representation of the document; an empty string when the value is null.
     */
    public function toString() : string
    {
        if (null === $this->value) {
            return '';
        }

        return $this->value->toString();
    }

    public function type() : Type
    {
        return $this->type;
    }

    public function value() : ?HTMLDocument
    {
        return $this->value;
    }

    /**
     * Returns a new entry with the given value, asserted against the optional variant
     * of this entry's type, keeping the name and metadata.
     */
    public function withValue(mixed $value) : self
    {
        return new self($this->name, type_optional($this->type())->assert($value), $this->metadata);
    }
}

src/core/etl/src/Flow/ETL/Row/EntryFactory.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
datetime_entry,
1010
enum_entry,
1111
float_entry,
12+
html_entry,
1213
int_entry,
1314
json_entry,
1415
json_object_entry,
@@ -51,6 +52,7 @@ enum_entry,
5152
UuidType,
5253
XMLElementType,
5354
XMLType};
55+
use Flow\Types\Type\Logical\HTMLType;
5456
use Flow\Types\Type\Native\{
5557
ArrayType,
5658
BooleanType,
@@ -171,6 +173,7 @@ public function createAs(string $entryName, mixed $value, Definition|Type $defin
171173
NullType::class => StringEntry::fromNull($entryName, $metadata),
172174
XMLType::class => xml_entry($entryName, null, $metadata),
173175
XMLElementType::class => xml_element_entry($entryName, null, $metadata),
176+
HTMLType::class => html_entry($entryName, null, $metadata),
174177
default => throw new InvalidArgumentException("Can't convert value into type \"{$type->toString()}\""),
175178
};
176179
}
@@ -234,6 +237,10 @@ public function createAs(string $entryName, mixed $value, Definition|Type $defin
234237
}
235238
}
236239

240+
if ($type instanceof HTMLType) {
241+
return html_entry($entryName, type_optional($type)->cast($value), $metadata);
242+
}
243+
237244
if ($type instanceof XMLType) {
238245
return xml_entry($entryName, type_optional($type)->cast($value), $metadata);
239246
}

src/core/etl/tests/Flow/ETL/Tests/Integration/DataFrame/DisplayTest.php

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ enum_entry,
1111
float_entry,
1212
from_array,
1313
from_rows,
14+
html_entry,
1415
int_entry,
1516
json_entry,
1617
list_entry,
@@ -75,6 +76,7 @@ public function extract(FlowContext $context) : \Generator
7576
),
7677
enum_entry('enum', BackedStringEnum::three),
7778
xml_entry('xml', '<xml><node id="123">test<foo>bar</foo></node></xml>'),
79+
html_entry('html', '<html lang="en"><body><div><span>bar</span></div></body></html>'),
7880
),
7981
);
8082
}
@@ -84,15 +86,15 @@ enum_entry('enum', BackedStringEnum::three),
8486

8587
self::assertCommandOutputIdentical(
8688
<<<'ASCIITABLE'
87-
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+
88-
| id | price | 100 | deleted | created-at | phase | array | list | map | items | enum | xml |
89-
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+
90-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
91-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
92-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
93-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
94-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
95-
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+
89+
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+----------------------+
90+
| id | price | 100 | deleted | created-at | phase | array | list | map | items | enum | xml | html |
91+
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+----------------------+
92+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <html lang="en"><bod |
93+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <html lang="en"><bod |
94+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <html lang="en"><bod |
95+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <html lang="en"><bod |
96+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <html lang="en"><bod |
97+
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+----------------------+
9698
5 rows
9799

98100
ASCIITABLE,
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Unit\Row\Entry;
6+
7+
use function Flow\ETL\DSL\html_entry;
8+
use Flow\ETL\Row\Entry\HTMLEntry;
9+
use Flow\ETL\Tests\FlowTestCase;
10+
use Flow\Types\Value\HTMLDocument;
11+
use PHPUnit\Framework\Attributes\DataProvider;
12+
13+
/**
 * Unit tests for HTMLEntry: construction, equality, duplication and string output.
 */
final class HTMLEntryTest extends FlowTestCase
{
    /**
     * Yields [expected isEqual() result, first entry, second entry].
     */
    public static function is_equal_data_provider() : \Generator
    {
        $doc1 = HTMLDocument::fromString('<html><body><div>2</div><p>3</p></body></html>');
        $doc2 = HTMLDocument::fromString('<html><body><div>2</div><p>3</p></body></html>');

        yield 'equal names and equal simple html documents' => [
            true,
            html_entry('name', $doc1),
            html_entry('name', $doc2),
        ];

        $doc1 = HTMLDocument::fromString('<html><body><div id="id">2</div><p>3</p></body></html>');
        $doc2 = HTMLDocument::fromString('<html><body><div id="id">2</div><p>3</p></body></html>');

        // Both documents are byte-identical and carry a single attribute, so this case
        // covers identical attributes only, not attribute ordering.
        yield 'equal names and equal html documents with identical attributes' => [
            true,
            html_entry('name', $doc1),
            html_entry('name', $doc2),
        ];

        $doc1 = HTMLDocument::fromString('<html><body><div id="foo">2</div><p>3</p></body></html>');
        $doc2 = HTMLDocument::fromString('<html><body><div id="bar">2</div><p>3</p></body></html>');

        yield 'equal nodes but different attributes' => [
            false,
            html_entry('name', $doc1),
            html_entry('name', $doc2),
        ];

        $doc1 = HTMLDocument::fromString('<html><body><div id="id">2</div><p>3</p></body></html>');
        $doc2 = HTMLDocument::fromString('<html><body><p>3</p></body></html>');

        yield 'equal attributes but different nodes' => [
            false,
            html_entry('name', $doc1),
            html_entry('name', $doc2),
        ];

        $doc1 = HTMLDocument::fromString('<html><body><div id="id">2</div><p>3</p></body></html>');
        $doc2 = HTMLDocument::fromString('');

        yield 'compare with empty document' => [
            false,
            html_entry('name', $doc1),
            html_entry('name', $doc2),
        ];

        $doc1 = HTMLDocument::fromString('');
        $doc2 = HTMLDocument::fromString('');

        yield 'compare two empty documents' => [
            true,
            html_entry('name', $doc1),
            html_entry('name', $doc2),
        ];
    }

    /**
     * Entries built from a single-line document and from the same markup spread over
     * several lines are expected not to be equal under PHPUnit's object comparison.
     */
    public function test_canonicalization() : void
    {
        $doc = HTMLDocument::fromString('<html><body><div id="foo">2</div><p>3</p></body></html>');
        $doc2 = HTMLDocument::fromString(<<<'HTML'
            <html>
            <body>
            <div id="foo">2</div>
            <p>3</p>
            </body>
            </html>
            HTML);

        self::assertNotEquals(
            html_entry('row', $doc),
            html_entry('row', $doc2),
        );
    }

    public function test_creating_entry_from_valid_html_string() : void
    {
        $entry = html_entry('name', '<html><body><div id="id">2</div><p>3</p></body></html>');

        self::assertSame('name', $entry->name());
        self::assertSame('<html><body><div id="id">2</div><p>3</p></body></html>', $entry->__toString());
    }

    /**
     * A document created from plain text is still accepted and returned unchanged by value().
     */
    public function test_creating_html_entry_with_invalid_html_document() : void
    {
        $doc = HTMLDocument::fromString('body');
        $entry = html_entry('name', $doc);

        self::assertSame('name', $entry->name());
        self::assertSame($doc, $entry->value());
        self::assertStringContainsString('body', $entry->toString());
    }

    /**
     * duplicate() must return a distinct instance that is structurally equal to the original.
     */
    public function test_duplicating_entry() : void
    {
        $entry = html_entry('html', <<<'HTML'
            <html>
            <body>
            <div id="foo">2</div>
            <p>3</p>
            </body>
            </html>
            HTML);
        $duplicated = $entry->duplicate();

        self::assertNotSame($entry, $duplicated);
        self::assertEquals($entry, $duplicated);
    }

    #[DataProvider('is_equal_data_provider')]
    public function test_is_equal(bool $equals, HTMLEntry $entry, HTMLEntry $nextEntry) : void
    {
        self::assertSame($equals, $entry->isEqual($nextEntry));
    }
}

0 commit comments

Comments
 (0)