@@ -24,25 +24,34 @@ class Readability implements LoggerAwareInterface
2424 public const MIN_ARTICLE_LENGTH = 200 ;
2525 public const MIN_NODE_LENGTH = 80 ;
2626 public const MAX_LINK_DENSITY = 0.25 ;
27- public $ convertLinksToFootnotes = false ;
28- public $ revertForcedParagraphElements = false ;
29- public $ articleTitle ;
30- public $ articleContent ;
31- public $ original_html ;
27+
28+ public bool $ convertLinksToFootnotes = false ;
29+ public bool $ revertForcedParagraphElements = false ;
30+ // Nullable typed properties below get an explicit null default: without one a typed property is "uninitialized" (not null) and reading it throws an Error, unlike the untyped properties they replace.
31+ public ?\DOMElement $ articleTitle = null ;
32+
33+ public ?\DOMElement $ articleContent = null ;
34+
35+ public ?string $ original_html = null ;
36+
37+ public ?\DOMDocument $ dom = null ;
38+
3239 /**
33- * @var \DOMDocument
40+ * @var ?string URL where HTML was retrieved
3441 */
35- public $ dom ;
36- // optional - URL where HTML was retrieved
37- public $ url = null ;
38- // preserves more content (experimental)
39- public $ lightClean = true ;
42+ public ?string $ url = null ;
4043
4144 /**
42- * All of the regular expressions in use within readability.
45+ * @var bool preserves more content (experimental)
46+ */
47+ public bool $ lightClean = true ;
48+
49+ /**
50+ * @var array<string, string> All of the regular expressions in use within readability.
51+ *
4352 * Defined up here so we don't instantiate them repeatedly in loops.
4453 */
45- public $ regexps = [
54+ public array $ regexps = [
4655 'unlikelyCandidates ' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i ' ,
4756 'okMaybeItsACandidate ' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i ' ,
4857 'positive ' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i ' ,
@@ -54,18 +63,30 @@ class Readability implements LoggerAwareInterface
5463 'hasContent ' => '/\S$/ ' ,
5564 'isNotVisible ' => '/display\s*:\s*none/ ' ,
5665 ];
57- public $ defaultTagsToScore = ['section ' , 'h2 ' , 'h3 ' , 'h4 ' , 'h5 ' , 'h6 ' , 'p ' , 'td ' , 'pre ' ];
58- // The commented out elements qualify as phrasing content but tend to be
59- // removed by readability when put into paragraphs, so we ignore them here.
60- public $ phrasingElements = [
66+
67+ /**
68+ * @var array<string> lower-case HTML tag names (presumably the element types scored by default; confirm against usage)
69+ */
70+ public array $ defaultTagsToScore = ['section ' , 'h2 ' , 'h3 ' , 'h4 ' , 'h5 ' , 'h6 ' , 'p ' , 'td ' , 'pre ' ];
71+
72+ /**
73+ * @var array<string> upper-case names of HTML elements treated as phrasing content
74+ */
75+ public array $ phrasingElements = [
76+ // The commented out elements qualify as phrasing content but tend to be
77+ // removed by readability when put into paragraphs, so we ignore them here.
6178 // "CANVAS", "IFRAME", "SVG", "VIDEO",
6279 'ABBR ' , 'AUDIO ' , 'B ' , 'BDO ' , 'BR ' , 'BUTTON ' , 'CITE ' , 'CODE ' , 'DATA ' ,
6380 'DATALIST ' , 'DFN ' , 'EM ' , 'EMBED ' , 'I ' , 'IMG ' , 'INPUT ' , 'KBD ' , 'LABEL ' ,
6481 'MARK ' , 'MATH ' , 'METER ' , 'NOSCRIPT ' , 'OBJECT ' , 'OUTPUT ' , 'PROGRESS ' , 'Q ' ,
6582 'RUBY ' , 'SAMP ' , 'SCRIPT ' , 'SELECT ' , 'SMALL ' , 'SPAN ' , 'STRONG ' , 'SUB ' ,
6683 'SUP ' , 'TEXTAREA ' , 'TIME ' , 'VAR ' , 'WBR ' ,
6784 ];
68- public $ tidy_config = [
85+
86+ /**
87+ * @var array<string, bool|string>
88+ */
89+ public array $ tidy_config = [
6990 'tidy-mark ' => false ,
7091 'vertical-space ' => false ,
7192 'doctype ' => 'omit ' ,
@@ -89,21 +110,41 @@ class Readability implements LoggerAwareInterface
89110 'output-encoding ' => 'utf8 ' ,
90111 'hide-comments ' => true ,
91112 ];
92- // article domain regexp for calibration
93- protected $ domainRegExp = null ;
94- protected $ body = null ;
95- // Cache the body HTML in case we need to re-use it later
96- protected $ bodyCache = null ;
97- // 1 | 2 | 4; // Start with all processing flags set.
98- protected $ flags = 7 ;
99- // indicates whether we were able to extract or not
100- protected $ success = false ;
101- protected $ logger ;
102- protected $ parser ;
103- protected $ html ;
104- protected $ useTidy ;
105- // raw HTML filters
106- protected $ pre_filters = [
113+
114+ /**
115+ * @var ?string article domain regexp for calibration
116+ */
117+ protected ?string $ domainRegExp = null ;
118+
119+ protected ?\DOMElement $ body = null ;
120+
121+ /**
122+ * @var ?string Cache the body HTML in case we need to re-use it later
123+ */
124+ protected ?string $ bodyCache = null ;
125+
126+ /**
127+ * @var int-mask-of<self::FLAG_*> start with all processing flags set (the three FLAG_* constants OR-ed together; the previous revision hard-coded 7 for "1 | 2 | 4")
128+ */
129+ protected int $ flags = self ::FLAG_STRIP_UNLIKELYS | self ::FLAG_WEIGHT_ATTRIBUTES | self ::FLAG_CLEAN_CONDITIONALLY ;
130+
131+ /**
132+ * @var bool indicates whether we were able to extract or not
133+ */
134+ protected bool $ success = false ;
135+ // NOTE(review): the four typed properties below have no default, so they stay uninitialized (reading them throws) until assigned; presumably the constructor sets them, confirm.
136+ protected LoggerInterface $ logger ;
137+
138+ protected string $ parser ;
139+
140+ protected string $ html ;
141+
142+ protected bool $ useTidy ;
143+
144+ /**
145+ * @var array<string, string> raw HTML filters
146+ */
147+ protected array $ pre_filters = [
107148 // remove spans as we redefine styles and they're probably special-styled
108149 '!</?span[^>]*>!is ' => '' ,
109150 // HACK: firewall-filtered content
@@ -115,8 +156,11 @@ class Readability implements LoggerAwareInterface
115156 // replace fonts to spans
116157 '!<(/?)font[^>]*>!is ' => '< \\1span> ' ,
117158 ];
118- // output HTML filters
119- protected $ post_filters = [
159+
160+ /**
161+ * @var array<string, string> output HTML filters
162+ */
163+ protected array $ post_filters = [
120164 // replace excessive br's
121165 '/<br\s*\/?>\s*<p/i ' => '<p ' ,
122166 // replace empty tags that break layouts
@@ -156,20 +200,16 @@ public function setLogger(LoggerInterface $logger): void
156200
157201 /**
158202 * Get article title element.
159- *
160- * @return \DOMElement
+ *
+ * @return ?\DOMElement the title element, or null while the nullable $articleTitle property holds no element
161203 */
162- public function getTitle ()
204+ public function getTitle (): ?\DOMElement
163205 {
164206 return $ this ->articleTitle ;
165207 }
166208
167209 /**
168210 * Get article content element.
169- *
170- * @return \DOMElement
+ *
+ * @return ?\DOMElement the content element, or null while the nullable $articleContent property holds no element
171211 */
172- public function getContent ()
212+ public function getContent (): ?\DOMElement
173213 {
174214 return $ this ->articleContent ;
175215 }
@@ -451,12 +491,8 @@ public function prepArticle(\DOMNode $articleContent): void
451491 /**
452492 * Get the inner text of a node.
453493 * This also strips out any excess whitespace to be found.
454- *
455- * @param \DOMElement $e
456- * @param bool $normalizeSpaces (default: true)
457- * @param bool $flattenLines (default: false)
458494 */
459- public function getInnerText ($ e , bool $ normalizeSpaces = true , bool $ flattenLines = false ): string
495+ public function getInnerText (\ DOMElement $ e , bool $ normalizeSpaces = true , bool $ flattenLines = false ): string
460496 {
461497 if (null === $ e || !isset ($ e ->textContent ) || '' === $ e ->textContent ) {
462498 return '' ;
@@ -749,10 +785,8 @@ public function removeFlag(int $flag): void
749785
750786 /**
751787 * Get the article title as an H1.
752- *
753- * @return \DOMElement
754788 */
755- protected function getArticleTitle ()
789+ protected function getArticleTitle (): \ DOMElement
756790 {
757791 try {
758792 $ curTitle = $ origTitle = $ this ->getInnerText ($ this ->dom ->getElementsByTagName ('title ' )->item (0 ));
@@ -892,7 +926,7 @@ protected function initializeNode(\DOMElement $node): void
892926 *
893927 * @return \DOMElement|false
894928 */
895- protected function grabArticle (?\DOMElement $ page = null )
929+ protected function grabArticle (?\DOMElement $ page = null ): \ DOMElement | bool
896930 {
897931 if (!$ page ) {
898932 $ page = $ this ->dom ;
0 commit comments