v6/src/generator/util/word_count.rs

94 lines
2.4 KiB
Rust

use std::{collections::VecDeque, rc::Rc};
use html5ever::{
local_name, namespace_url, ns, parse_fragment, tendril::TendrilSink, ParseOpts, QualName,
};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event, Parser};
/// Counts the words in `s`, treating every whitespace-delimited group of characters as a word.
///
/// Whitespace is whatever [`char::is_whitespace`] accepts, so spaces, tabs and newlines all
/// separate words, and runs of consecutive whitespace count as a single separator. Leading
/// and trailing whitespace is ignored; empty or whitespace-only input yields `0`.
///
/// The result saturates at `u32::MAX` if the input somehow contains more words than that.
pub fn simple(s: &str) -> u32 {
    // `split_whitespace` never yields empty pieces, so the number of items it produces is
    // exactly the number of maximal non-whitespace runs.
    let words = s.split_whitespace().count();
    // Clamp before narrowing so the cast below can never truncate.
    words.min(u32::MAX as usize) as u32
}
/// Counts the words in the Markdown source `s`.
///
/// The input is run through a `pulldown_cmark` [`Parser`], and the payload of every
/// `Event::Text` and `Event::Code` event is counted with [`simple`]. All other events
/// contribute nothing to the total.
pub fn markdown(s: &str) -> u32 {
    Parser::new(s)
        .map(|event| match event {
            // Both variants carry a string payload that is counted the same way.
            Event::Text(content) | Event::Code(content) => simple(&content),
            _ => 0,
        })
        .sum()
}
/// Counts the words in the HTML fragment `s`.
///
/// The fragment is parsed into an [`RcDom`] with a `<div>` element as the parsing context.
/// The resulting tree is then walked breadth-first, and the contents of every
/// `NodeData::Text` node are counted with [`simple`].
pub fn html(s: &str) -> u32 {
    let context = QualName::new(None, ns!(html), local_name!("div"));
    let dom = parse_fragment(RcDom::default(), ParseOpts::default(), context, vec![]).one(s);

    let mut total = 0;
    let mut pending: VecDeque<Handle> = VecDeque::new();
    // The queue holds its own `Rc` handle to each node, so it is seeded with a clone of the
    // document root rather than a borrow of it.
    pending.push_back(Rc::clone(&dom.document));

    while let Some(node) = pending.pop_front() {
        if let NodeData::Text { contents } = &node.data {
            total += simple(contents.borrow().as_ref());
        }
        // Enqueue every child so it is visited after the nodes already waiting.
        pending.extend(node.children.borrow().iter().cloned());
    }

    total
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain strings: single word, two words, and irregular surrounding whitespace.
    #[test]
    fn test_simple() {
        let cases: [(&str, u32); 3] = [("foo", 1), ("foo bar", 2), (" foo\n\t bar ", 2)];
        for &(input, expected) in cases.iter() {
            assert_eq!(simple(input), expected);
        }
    }

    /// Markdown: heading text and an inline code span each count as one word.
    #[test]
    fn test_markdown() {
        let cases: [(&str, u32); 2] = [("# foo", 1), ("`foo`", 1)];
        for &(input, expected) in cases.iter() {
            assert_eq!(markdown(input), expected);
        }
    }

    /// HTML: only the text inside the element is counted, not the tags.
    #[test]
    fn test_html() {
        let word_count = html("<h1> test </h1>");
        assert_eq!(word_count, 1);
    }
}