From d6bda2d61785b1d0a3d0cdc530fc1a4749b985b9 Mon Sep 17 00:00:00 2001 From: Erik Brinkman Date: Wed, 24 Apr 2024 21:07:27 -0400 Subject: [PATCH 1/2] fmt, clippy, updates --- Cargo.toml | 2 +- benches/lib.rs | 8 +- src/lib.rs | 272 +++++++++++++++++++++++++++++++--------------- src/metadata.rs | 76 ++++++++----- src/node_cache.rs | 3 +- tests/lib.rs | 55 ++++++---- 6 files changed, 270 insertions(+), 146 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 728321d..b2a249d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,6 @@ regex = "1.6.0" url = "2.3.1" [dev-dependencies] -env_logger = "0.9.3" +env_logger = "0.11.3" serde = {version = "1.0", features = ["derive"]} serde_json = {version = "1.0", features = ["std"]} diff --git a/benches/lib.rs b/benches/lib.rs index 168042c..2ee15b7 100644 --- a/benches/lib.rs +++ b/benches/lib.rs @@ -10,11 +10,10 @@ use url::Url; use readable_readability::Readability; - macro_rules! include_sample_file { ($name:ident, $file:expr) => { include_str!(concat!("../samples/", stringify!($name), "/", $file)) - } + }; } macro_rules! bench_sample { @@ -23,10 +22,11 @@ macro_rules! bench_sample { fn $name(b: &mut Bencher) { static SOURCE: &'static str = include_sample_file!($name, "source.html"); - b.iter(|| + b.iter(|| { Readability::new() .base_url(Url::parse("http://fakehost/test/page.html").unwrap()) - .parse(SOURCE)); + .parse(SOURCE) + }); } }; } diff --git a/src/lib.rs b/src/lib.rs index 8863fdf..daed2f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,15 @@ use std::cmp; -use std::iter; use std::f32; use std::fmt; +use std::iter; -use regex::Regex; -use html5ever::{QualName, local_name, namespace_url, ns}; -use kuchiki::{NodeRef, NodeDataRef, NodeData, ElementData, Attributes}; -use kuchiki::traits::TendrilSink; +use html5ever::{local_name, namespace_url, ns, QualName}; use kuchiki::iter::NodeIterator; +use kuchiki::traits::TendrilSink; +use kuchiki::{Attributes, ElementData, NodeData, NodeDataRef, NodeRef}; use lazy_static::lazy_static; use log::trace; +use regex::Regex; use url::Url; pub use metadata::Metadata; @@ -27,7 +27,9 @@ trait NodeRefExt { fn node_ref(&self) -> &NodeRef; fn is(&self, name: QualName) -> bool { - self.node_ref().as_element().map_or(false, |e| e.name == name) + self.node_ref() + .as_element() + .map_or(false, |e| e.name == name) } fn replace(&self, node: &N) { @@ -43,7 +45,7 @@ trait NodeRefExt { let node = self.node_ref(); if let Some(elem) = node.as_element() { - // I'd like find a way to do this without clone(), but + // I'd like find a way to do this without clone(), but // I'm not sure how because BTreeMap doesn't have drain() let attributes = elem.attributes.borrow(); let replacement = NodeRef::new_element(name, attributes.map.clone()); @@ -117,7 +119,7 @@ lazy_static! { } macro_rules! tag { - ($name:tt) => { + ($name:tt) => { QualName { prefix: None, ns: ns!(html), @@ -127,7 +129,9 @@ macro_rules! tag { } macro_rules! attrib { - ($name:tt) => { local_name!($name) }; + ($name:tt) => { + local_name!($name) + }; } fn extract_byline(elem: &ElemRef) -> Option { @@ -169,8 +173,8 @@ fn is_unlikely_candidate(elem: &ElemRef) -> bool { let classes = attributes.get(attrib!("class")).unwrap_or(""); let id = attributes.get(attrib!("id")).unwrap_or(""); - (UNLIKELY_CANDIDATE.is_match(classes) || UNLIKELY_CANDIDATE.is_match(id)) && - !(MAYBE_CANDIDATE.is_match(classes) || MAYBE_CANDIDATE.is_match(id)) + (UNLIKELY_CANDIDATE.is_match(classes) || UNLIKELY_CANDIDATE.is_match(id)) + && !(MAYBE_CANDIDATE.is_match(classes) || MAYBE_CANDIDATE.is_match(id)) } fn transform_div(div: &ElemRef) { @@ -217,14 +221,18 @@ fn has_single_p(node: &NodeRef) -> bool { return false; } - node.children().text_nodes().all(|t| t.borrow().trim().is_empty()) + node.children() + .text_nodes() + .all(|t| t.borrow().trim().is_empty()) } fn has_block_elem(node: &NodeRef) -> bool { - node.descendants().elements().any(|elem| matches!{ - elem.name, - tag!("a") | tag!("blockquote") | tag!("dl") | tag!("div") | tag!("img") | tag!("ol") | - tag!("p") | tag!("pre") | tag!("table") | tag!("ul") | tag!("select") + node.descendants().elements().any(|elem| { + matches! { + elem.name, + tag!("a") | tag!("blockquote") | tag!("dl") | tag!("div") | tag!("img") | tag!("ol") | + tag!("p") | tag!("pre") | tag!("table") | tag!("ul") | tag!("select") + } }) } @@ -254,7 +262,7 @@ fn count_chars(text: &str) -> (u32, u32) { } fn is_tag_to_score(tag: &QualName) -> bool { - matches!{ + matches! { *tag, tag!("section") | tag!("p") | tag!("td") | tag!("pre") | tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") @@ -272,7 +280,7 @@ fn tag_score(tag: &QualName) -> f32 { tag!("body") => -5., tag!("h1") | tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") => -5., tag!("th") => -5., - _ => 0. + _ => 0., } } @@ -281,13 +289,21 @@ fn class_score(elem: &ElemRef) -> f32 { let mut score = 0.; if let Some(classes) = attributes.get(attrib!("class")) { - if POSITIVE.is_match(classes) { score += 25.; } - if NEGATIVE.is_match(classes) { score -= 25.; } + if POSITIVE.is_match(classes) { + score += 25.; + } + if NEGATIVE.is_match(classes) { + score -= 25.; + } } if let Some(id) = attributes.get(attrib!("id")) { - if POSITIVE.is_match(id) { score += 25.; } - if NEGATIVE.is_match(id) { score -= 25.; } + if POSITIVE.is_match(id) { + score += 25.; + } + if NEGATIVE.is_match(id) { + score -= 25.; + } } score @@ -298,8 +314,14 @@ fn is_stuffed(elem: &ElemRef, info: &NodeInfo) -> bool { // TODO: remove , etc. tag!("h1") | tag!("footer") | tag!("button") => false, - tag!("div") | tag!("section") | tag!("header") | - tag!("h2") | tag!("h3") | tag!("h4") | tag!("h5") | tag!("h6") => { + tag!("div") + | tag!("section") + | tag!("header") + | tag!("h2") + | tag!("h3") + | tag!("h4") + | tag!("h5") + | tag!("h6") => { if info.text_len == 0 { let children_count = elem.as_node().children().count() as u32; @@ -309,19 +331,23 @@ fn is_stuffed(elem: &ElemRef, info: &NodeInfo) -> bool { } true - }, + } tag!("thead") | tag!("tbody") | tag!("th") | tag!("tr") | tag!("td") => - // TODO: add