Skip to main content

flams_search/textify/
parser.rs

1use std::cell::{Cell, RefCell};
2
3#[allow(clippy::wildcard_imports)]
4use super::ever::*;
5use ftml_ontology::narrative::DocumentRange;
6use html5ever::{
7    QualName,
8    interface::{NodeOrText, TreeSink},
9    tendril::{SliceExt, StrTendril, TendrilSink},
10};
11
/// Pending text for a two-operand MathML element that is currently open.
///
/// All variants share the same positional layout:
///
/// 0. the buffer that text is currently being collected into,
/// 1. the most recently completed operand buffer (if any),
/// 2. the operand buffer completed before that one (if any).
///
/// Each time a new child is appended directly to the owning element, and while
/// slot 1 or slot 2 is still empty, the parser moves slot 1 into slot 2 and the
/// current buffer into slot 1, leaving a fresh empty buffer in slot 0.
#[derive(Debug, Clone, PartialEq, Eq)]
enum State {
    /// An open `msup` element; rendered as `base ^{exponent}` when it closes.
    Msup(String, Option<String>, Option<String>),
    /// An open `msub` element; rendered as `base _{index}` when it closes.
    Msub(String, Option<String>, Option<String>),
    /// An open `mfrac` element; rendered as `{numerator}/{denominator}` when it closes.
    Mfrac(String, Option<String>, Option<String>),
}
17
/// An `html5ever` tree sink that builds a node tree and, while doing so,
/// collects a plain-text rendering of the document.
///
/// All mutable state sits behind `Cell`/`RefCell` because the sink's callbacks
/// only receive `&self`.
pub struct HtmlParser {
    /// Root document node; handed to the tree builder as the document handle.
    pub(crate) document_node: NodeRef,
    /// Recorded when the `body` element is closed: its start/end offsets as a
    /// range, plus `"<body>".len()` added to the length reported by its
    /// attribute list.
    pub(crate) body: Cell<(DocumentRange, usize)>,
    /// The text output accumulated so far.
    pub(crate) out: RefCell<String>,
    /// Stack of currently open `msup`/`msub`/`mfrac` elements; while it is
    /// non-empty, emitted text goes into the top entry instead of `out`.
    states: RefCell<Vec<State>>,
    /// Whether text nodes are currently emitted. Initialised from the `inline`
    /// argument of `run`, set when a `body` element is appended and cleared
    /// when it is closed.
    in_body: Cell<bool>,
}
25
26impl HtmlParser {
27    pub fn run(s: &str, inline: bool) -> Result<String, String> {
28        let parser = Self {
29            document_node: super::ever::NodeRef::new_document(),
30            body: Cell::new((DocumentRange::default(), 0)),
31            out: RefCell::new(String::new()),
32            states: RefCell::new(Vec::new()),
33            in_body: Cell::new(inline),
34        };
35        html5ever::parse_document(parser, html5ever::ParseOpts::default())
36            .from_utf8()
37            .one(s.as_bytes().to_tendril())
38    }
39
40    fn newline(&self) {
41        if let Some(State::Msup(out, _, _) | State::Msub(out, _, _) | State::Mfrac(out, _, _)) =
42            self.states.borrow_mut().last_mut()
43        {
44            if !out.is_empty() {
45                out.push('\n');
46            }
47            return;
48        }
49        let mut out = self.out.borrow_mut();
50        if !out.is_empty() {
51            out.push('\n');
52        }
53    }
54
55    fn add(&self, s: &str) {
56        if let Some(State::Msup(out, _, _) | State::Msub(out, _, _) | State::Mfrac(out, _, _)) =
57            self.states.borrow_mut().last_mut()
58        {
59            if !out.is_empty() && !out.ends_with([' ', '\n']) {
60                out.push(' ');
61            }
62            out.push_str(s);
63            return;
64        }
65        let mut out = self.out.borrow_mut();
66        if !out.is_empty() && !out.ends_with([' ', '\n']) {
67            out.push(' ');
68        }
69        out.push_str(s);
70    }
71
72    fn pair(&self, sep: char, a: String, b: String) {
73        if a.is_empty() && b.is_empty() {
74            return;
75        }
76        if b.is_empty() {
77            self.add(&a);
78        }
79        self.add(&a);
80        self.add(&format!("{sep}{{{b}}}"))
81    }
82}
83
84impl TreeSink for HtmlParser {
85    type Handle = NodeRef;
86    type Output = Result<String, String>;
87    type ElemName<'a>
88        = &'a QualName
89    where
90        Self: 'a;
91
92    #[allow(clippy::cast_possible_truncation)]
93    fn finish(self) -> Self::Output {
94        for c in self.document_node.children() {
95            self.pop(&c);
96        }
97        Ok(self.out.into_inner())
98    }
99
100    #[inline]
101    fn parse_error(&self, _: std::borrow::Cow<'static, str>) {}
102
103    #[inline]
104    fn get_document(&self) -> Self::Handle {
105        self.document_node.clone()
106    }
107    #[inline]
108    fn set_quirks_mode(&self, mode: html5ever::interface::QuirksMode) {
109        let NodeData::Document(r) = self.document_node.data() else {
110            unreachable!()
111        };
112        r.set(mode);
113    }
114
115    #[inline]
116    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
117        x == y
118    }
119
120    #[inline]
121    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
122        &target.as_element().unwrap_or_else(|| unreachable!()).name
123    }
124
125    #[inline]
126    fn create_element(
127        &self,
128        name: QualName,
129        attrs: Vec<html5ever::Attribute>,
130        _: html5ever::interface::ElementFlags,
131    ) -> Self::Handle {
132        NodeRef::new_element(name, attrs.into())
133    }
134    #[inline]
135    fn create_comment(&self, text: StrTendril) -> NodeRef {
136        NodeRef::new_comment(text)
137    }
138    #[inline]
139    fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
140        NodeRef::new_processing_instruction(target, data)
141    }
142
143    #[allow(clippy::cast_possible_wrap)]
144    #[allow(clippy::too_many_lines)]
145    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
146        if let Some(e) = parent.last_child() {
147            self.pop(&e);
148        }
149        {
150            if let Some(p) = parent.as_element() {
151                let mut sts = self.states.borrow_mut();
152                let last = sts.last_mut();
153                if &*p.name.local == "msup"
154                    && let Some(State::Msup(s, first, second)) = last
155                {
156                    if first.is_none() || second.is_none() {
157                        *second = first.take();
158                        *first = Some(std::mem::take(s));
159                    }
160                } else if &*p.name.local == "msub"
161                    && let Some(State::Msub(s, first, second)) = last
162                {
163                    if first.is_none() || second.is_none() {
164                        *second = first.take();
165                        *first = Some(std::mem::take(s));
166                    }
167                } else if &*p.name.local == "mfrac"
168                    && let Some(State::Mfrac(s, first, second)) = last
169                {
170                    if first.is_none() || second.is_none() {
171                        *second = first.take();
172                        *first = Some(std::mem::take(s));
173                    }
174                }
175            }
176        }
177        match child {
178            NodeOrText::AppendNode(child) => {
179                if parent.as_document().is_some() {
180                    if let Some(child_elem) = child.as_element() {
181                        let new_start = parent.len();
182                        let len = child.len();
183                        child_elem.start_offset.set(new_start);
184                        child_elem.end_offset.set(new_start + len);
185                    }
186                }
187                if let Some(e) = child.as_element() {
188                    let attrs = e.attributes.borrow();
189                    for a in &attrs.0 {
190                        if &*a.0.local == ftml_parser::FtmlKey::Definition.attr_name() {
191                            self.newline();
192                            self.add("DEFINITION: ");
193                        } else if &*a.0.local == ftml_parser::FtmlKey::Assertion.attr_name() {
194                            self.newline();
195                            self.add("ASSERTION: ");
196                        } else if &*a.0.local == ftml_parser::FtmlKey::Example.attr_name() {
197                            self.newline();
198                            self.add("EXAMPLE: ");
199                        } else if &*a.0.local == ftml_parser::FtmlKey::Problem.attr_name()
200                            || &*a.0.local == ftml_parser::FtmlKey::SubProblem.attr_name()
201                        {
202                            self.newline();
203                            self.add("PROBLEM: ");
204                        }
205                    }
206                    drop(attrs);
207                    if &*e.name.local == "msup" {
208                        self.states
209                            .borrow_mut()
210                            .push(State::Msup(String::new(), None, None));
211                    } else if &*e.name.local == "msub" {
212                        self.states
213                            .borrow_mut()
214                            .push(State::Msub(String::new(), None, None));
215                    } else if &*e.name.local == "mfrac" {
216                        self.states
217                            .borrow_mut()
218                            .push(State::Mfrac(String::new(), None, None));
219                    } else if &*e.name.local == "tr" {
220                        self.newline();
221                    } else if &*e.name.local == "td" {
222                        self.add("|");
223                    } else if &*e.name.local == "body" {
224                        self.in_body.set(true);
225                    }
226                }
227                parent.append(child);
228            }
229            NodeOrText::AppendText(text) => {
230                if let Some(elem) = parent.as_element() {
231                    let len = if matches!(
232                        &*elem.name.local,
233                        "style"
234                            | "script"
235                            | "xmp"
236                            | "iframe"
237                            | "noembed"
238                            | "noframes"
239                            | "plaintext"
240                            | "noscript"
241                    ) {
242                        text.as_bytes().len()
243                    } else {
244                        escaped_len(&text, false)
245                    };
246                    prolong(parent, len as isize);
247                    if self.in_body.get() {
248                        let txt = text.trim();
249                        if !txt.is_empty() {
250                            self.add(txt);
251                        }
252                    }
253                }
254                if let Some(last_child) = parent.last_child()
255                    && let Some(existing) = last_child.as_text()
256                {
257                    existing.borrow_mut().extend(text.chars());
258                    return;
259                }
260                parent.append(NodeRef::new_text(text));
261            }
262        }
263    }
264
265    #[inline]
266    fn append_doctype_to_document(
267        &self,
268        name: StrTendril,
269        public_id: StrTendril,
270        system_id: StrTendril,
271    ) {
272        self.document_node
273            .append(NodeRef::new_doctype(name, public_id, system_id));
274    }
275
276    #[inline]
277    fn append_based_on_parent_node(
278        &self,
279        element: &Self::Handle,
280        prev_element: &Self::Handle,
281        child: NodeOrText<Self::Handle>,
282    ) {
283        if element.parent().is_some() {
284            self.append_before_sibling(element, child);
285        } else {
286            self.append(prev_element, child);
287        }
288    }
289
290    fn pop(&self, node: &Self::Handle) {
291        let Some(elem) = node.as_element() else {
292            return;
293        };
294        if elem.closed.get() {
295            return;
296        }
297        elem.closed.set(true);
298        for c in node.children() {
299            self.pop(&c);
300        }
301        let attrs = elem.attributes.borrow();
302        if attrs.0.iter().any(|a| {
303            [
304                ftml_parser::FtmlKey::Definition.attr_name(),
305                ftml_parser::FtmlKey::Assertion.attr_name(),
306                ftml_parser::FtmlKey::Example.attr_name(),
307                ftml_parser::FtmlKey::Problem.attr_name(),
308                ftml_parser::FtmlKey::SubProblem.attr_name(),
309            ]
310            .contains(&&*a.0.local)
311        }) {
312            self.newline();
313        }
314        drop(attrs);
315
316        let mut sts = self.states.borrow_mut();
317        if &elem.name.local == "body" {
318            let range = DocumentRange {
319                start: elem.start_offset.get(),
320                end: elem.end_offset.get(),
321            };
322            let off = elem.attributes.borrow().len();
323            self.body.set((range, "<body>".len() + off));
324            self.in_body.set(false);
325        } else if &elem.name.local == "msup"
326            && let Some(State::Msup(s, Some(a), _)) = sts.pop()
327        {
328            drop(sts);
329            self.pair('^', a, s);
330        } else if &elem.name.local == "msub"
331            && let Some(State::Msub(s, Some(a), _)) = sts.pop()
332        {
333            drop(sts);
334            self.pair('_', a, s);
335        } else if &elem.name.local == "mfrac"
336            && let Some(State::Mfrac(s, Some(a), _)) = sts.pop()
337        {
338            drop(sts);
339            self.add(&format!("{{{a}}}/{{{s}}}"));
340        }
341    }
342
343    #[inline]
344    fn append_before_sibling(&self, _sibling: &Self::Handle, _child: NodeOrText<Self::Handle>) {
345        unreachable!()
346    }
347
348    #[inline]
349    fn remove_from_parent(&self, _target: &Self::Handle) {
350        unreachable!()
351    }
352    #[inline]
353    fn reparent_children(&self, _node: &Self::Handle, _new_parent: &Self::Handle) {
354        unreachable!()
355    }
356    #[inline]
357    fn mark_script_already_started(&self, _node: &Self::Handle) {
358        unreachable!()
359    }
360    fn get_template_contents(&self, _target: &Self::Handle) -> Self::Handle {
361        unreachable!()
362    }
363    #[inline]
364    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<html5ever::Attribute>) {
365        if let Some(e) = target.as_element() {
366            let mut ats = e.attributes.borrow_mut();
367            for a in attrs {
368                if let Some(att) = ats.0.iter_mut().find(|att| att.0 == a.name) {
369                    *att = (a.name, a.value);
370                } else {
371                    ats.0.push((a.name, a.value));
372                }
373            }
374        }
375    }
376}
377
378#[allow(clippy::cast_sign_loss)]
379#[allow(clippy::cast_possible_wrap)]
380fn prolong(parent: &NodeRef, len: isize) {
381    if let Some(elem) = parent.as_element() {
382        let end = elem.end_offset.get();
383        elem.end_offset.set(((end as isize) + len) as usize);
384        if let Some(p) = parent.parent() {
385            prolong(&p, len);
386        }
387    }
388}