94 lines
2.4 KiB
Rust
94 lines
2.4 KiB
Rust
|
use std::{collections::VecDeque, rc::Rc};
|
||
|
|
||
|
use html5ever::{
|
||
|
local_name, namespace_url, ns, parse_fragment, tendril::TendrilSink, ParseOpts, QualName,
|
||
|
};
|
||
|
use markup5ever_rcdom::{Handle, NodeData, RcDom};
|
||
|
use pulldown_cmark::{Event, Parser};
|
||
|
|
||
|
/// Simple word counting: every whitespace-delimited group of characters is one word.
///
/// Whitespace is decided per character by the Unicode `White_Space` property (the same
/// definition `char::is_whitespace` uses), so spaces, tabs, newlines and non-ASCII spaces
/// all separate words. Leading, trailing and repeated whitespace never produces empty
/// words; an empty or all-whitespace string therefore counts as zero words.
///
/// The result saturates at `u32::MAX` instead of overflowing on absurdly large inputs.
pub fn simple(s: &str) -> u32 {
    // `split_whitespace` yields exactly the non-empty runs of non-whitespace characters,
    // which is what the word definition above asks for. Folding into a `u32` directly
    // avoids a lossy `usize as u32` cast on the final count.
    s.split_whitespace()
        .fold(0u32, |count, _word| count.saturating_add(1))
}
|
||
|
|
||
|
pub fn markdown(s: &str) -> u32 {
|
||
|
let mut words = 0;
|
||
|
let parser = Parser::new(s);
|
||
|
for event in parser {
|
||
|
match event {
|
||
|
Event::Text(text) => {
|
||
|
words += simple(&text);
|
||
|
}
|
||
|
Event::Code(text) => {
|
||
|
words += simple(&text);
|
||
|
}
|
||
|
_ => (),
|
||
|
}
|
||
|
}
|
||
|
words
|
||
|
}
|
||
|
|
||
|
pub fn html(s: &str) -> u32 {
|
||
|
let dom = parse_fragment(
|
||
|
RcDom::default(),
|
||
|
ParseOpts::default(),
|
||
|
QualName::new(None, ns!(html), local_name!("div")),
|
||
|
vec![],
|
||
|
)
|
||
|
.one(s);
|
||
|
let mut nodes = VecDeque::<Handle>::new();
|
||
|
// clone the dom.document Rc because otherwise it gets dropped at the end of the first loop
|
||
|
// iteration and takes the children with it
|
||
|
nodes.push_back(Rc::clone(&dom.document));
|
||
|
let mut words = 0;
|
||
|
while let Some(front) = nodes.pop_front() {
|
||
|
if let NodeData::Text { ref contents } = front.data {
|
||
|
words += simple(contents.borrow().as_ref());
|
||
|
}
|
||
|
let children = front.children.borrow();
|
||
|
nodes.reserve(children.len());
|
||
|
for child in children.iter() {
|
||
|
nodes.push_back(Rc::clone(child));
|
||
|
}
|
||
|
}
|
||
|
words
|
||
|
}
|
||
|
|
||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain strings: a single word, two words, and words surrounded by mixed whitespace.
    #[test]
    fn test_simple() {
        let cases = [("foo", 1), ("foo bar", 2), (" foo\n\t bar ", 2)];
        for (input, expected) in cases.iter() {
            assert_eq!(simple(input), *expected, "input: {:?}", input);
        }
    }

    /// Markdown: heading text and code spans are counted, the markup itself is not.
    #[test]
    fn test_markdown() {
        let cases = [("# foo", 1), ("`foo`", 1)];
        for (input, expected) in cases.iter() {
            assert_eq!(markdown(input), *expected, "input: {:?}", input);
        }
    }

    /// HTML: only the text inside the element is counted, not the tags.
    #[test]
    fn test_html() {
        let input = "<h1> test </h1>";
        assert_eq!(html(input), 1, "input: {:?}", input);
    }
}
|