use std::{collections::VecDeque, rc::Rc}; use html5ever::{ local_name, namespace_url, ns, parse_fragment, tendril::TendrilSink, ParseOpts, QualName, }; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event, Parser}; /// Simple word counting by considering every whitespace delimited grouped of characters a word. pub fn simple(s: &str) -> u32 { let mut words = 0; let mut prev_char_was_whitespace = true; for c in s.chars() { if c.is_whitespace() { if !prev_char_was_whitespace { words += 1; } prev_char_was_whitespace = true; } else { prev_char_was_whitespace = false; } } if !prev_char_was_whitespace { words += 1; } words } pub fn markdown(s: &str) -> u32 { let mut words = 0; let parser = Parser::new(s); for event in parser { match event { Event::Text(text) => { words += simple(&text); } Event::Code(text) => { words += simple(&text); } _ => (), } } words } pub fn html(s: &str) -> u32 { let dom = parse_fragment( RcDom::default(), ParseOpts::default(), QualName::new(None, ns!(html), local_name!("div")), vec![], ) .one(s); let mut nodes = VecDeque::::new(); // clone the dom.document Rc because otherwise it gets dropped at the end of the first loop // iteration and takes the children with it nodes.push_back(Rc::clone(&dom.document)); let mut words = 0; while let Some(front) = nodes.pop_front() { if let NodeData::Text { ref contents } = front.data { words += simple(contents.borrow().as_ref()); } let children = front.children.borrow(); nodes.reserve(children.len()); for child in children.iter() { nodes.push_back(Rc::clone(child)); } } words } #[cfg(test)] mod tests { use super::*; #[test] fn test_simple() { assert_eq!(simple("foo"), 1); assert_eq!(simple("foo bar"), 2); assert_eq!(simple(" foo\n\t bar "), 2); } #[test] fn test_markdown() { assert_eq!(markdown("# foo"), 1); assert_eq!(markdown("`foo`"), 1); } #[test] fn test_html() { assert_eq!(html("

test

"), 1); } }