1use std::cell::{Cell, RefCell};
2
3#[allow(clippy::wildcard_imports)]
4use super::ever::*;
5use ftml_ontology::narrative::DocumentRange;
6use html5ever::{
7 QualName,
8 interface::{NodeOrText, TreeSink},
9 tendril::{SliceExt, StrTendril, TendrilSink},
10};
11
/// Text buffered for a MathML `msup`, `msub` or `mfrac` element that is still open.
///
/// Every variant carries the same three fields, in this order:
///
/// 0. the buffer that receives text while this entry is on top of the parser's state stack,
/// 1. the most recently completed operand (`None` until the element receives a child),
/// 2. the operand completed before that one.
enum State {
    /// An open `<msup>` element.
    Msup(
        String,
        Option<String>,
        Option<String>,
    ),
    /// An open `<msub>` element.
    Msub(
        String,
        Option<String>,
        Option<String>,
    ),
    /// An open `<mfrac>` element.
    Mfrac(
        String,
        Option<String>,
        Option<String>,
    ),
}
17
18pub struct HtmlParser {
19 pub(crate) document_node: NodeRef,
20 pub(crate) body: Cell<(DocumentRange, usize)>,
21 pub(crate) out: RefCell<String>,
22 states: RefCell<Vec<State>>,
23 in_body: Cell<bool>,
24}
25
26impl HtmlParser {
27 pub fn run(s: &str, inline: bool) -> Result<String, String> {
28 let parser = Self {
29 document_node: super::ever::NodeRef::new_document(),
30 body: Cell::new((DocumentRange::default(), 0)),
31 out: RefCell::new(String::new()),
32 states: RefCell::new(Vec::new()),
33 in_body: Cell::new(inline),
34 };
35 html5ever::parse_document(parser, html5ever::ParseOpts::default())
36 .from_utf8()
37 .one(s.as_bytes().to_tendril())
38 }
39
40 fn newline(&self) {
41 if let Some(State::Msup(out, _, _) | State::Msub(out, _, _) | State::Mfrac(out, _, _)) =
42 self.states.borrow_mut().last_mut()
43 {
44 if !out.is_empty() {
45 out.push('\n');
46 }
47 return;
48 }
49 let mut out = self.out.borrow_mut();
50 if !out.is_empty() {
51 out.push('\n');
52 }
53 }
54
55 fn add(&self, s: &str) {
56 if let Some(State::Msup(out, _, _) | State::Msub(out, _, _) | State::Mfrac(out, _, _)) =
57 self.states.borrow_mut().last_mut()
58 {
59 if !out.is_empty() && !out.ends_with([' ', '\n']) {
60 out.push(' ');
61 }
62 out.push_str(s);
63 return;
64 }
65 let mut out = self.out.borrow_mut();
66 if !out.is_empty() && !out.ends_with([' ', '\n']) {
67 out.push(' ');
68 }
69 out.push_str(s);
70 }
71
72 fn pair(&self, sep: char, a: String, b: String) {
73 if a.is_empty() && b.is_empty() {
74 return;
75 }
76 if b.is_empty() {
77 self.add(&a);
78 }
79 self.add(&a);
80 self.add(&format!("{sep}{{{b}}}"))
81 }
82}
83
84impl TreeSink for HtmlParser {
85 type Handle = NodeRef;
86 type Output = Result<String, String>;
87 type ElemName<'a>
88 = &'a QualName
89 where
90 Self: 'a;
91
92 #[allow(clippy::cast_possible_truncation)]
93 fn finish(self) -> Self::Output {
94 for c in self.document_node.children() {
95 self.pop(&c);
96 }
97 Ok(self.out.into_inner())
98 }
99
100 #[inline]
101 fn parse_error(&self, _: std::borrow::Cow<'static, str>) {}
102
103 #[inline]
104 fn get_document(&self) -> Self::Handle {
105 self.document_node.clone()
106 }
107 #[inline]
108 fn set_quirks_mode(&self, mode: html5ever::interface::QuirksMode) {
109 let NodeData::Document(r) = self.document_node.data() else {
110 unreachable!()
111 };
112 r.set(mode);
113 }
114
115 #[inline]
116 fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
117 x == y
118 }
119
120 #[inline]
121 fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
122 &target.as_element().unwrap_or_else(|| unreachable!()).name
123 }
124
125 #[inline]
126 fn create_element(
127 &self,
128 name: QualName,
129 attrs: Vec<html5ever::Attribute>,
130 _: html5ever::interface::ElementFlags,
131 ) -> Self::Handle {
132 NodeRef::new_element(name, attrs.into())
133 }
134 #[inline]
135 fn create_comment(&self, text: StrTendril) -> NodeRef {
136 NodeRef::new_comment(text)
137 }
138 #[inline]
139 fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
140 NodeRef::new_processing_instruction(target, data)
141 }
142
143 #[allow(clippy::cast_possible_wrap)]
144 #[allow(clippy::too_many_lines)]
145 fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
146 if let Some(e) = parent.last_child() {
147 self.pop(&e);
148 }
149 {
150 if let Some(p) = parent.as_element() {
151 let mut sts = self.states.borrow_mut();
152 let last = sts.last_mut();
153 if &*p.name.local == "msup"
154 && let Some(State::Msup(s, first, second)) = last
155 {
156 if first.is_none() || second.is_none() {
157 *second = first.take();
158 *first = Some(std::mem::take(s));
159 }
160 } else if &*p.name.local == "msub"
161 && let Some(State::Msub(s, first, second)) = last
162 {
163 if first.is_none() || second.is_none() {
164 *second = first.take();
165 *first = Some(std::mem::take(s));
166 }
167 } else if &*p.name.local == "mfrac"
168 && let Some(State::Mfrac(s, first, second)) = last
169 {
170 if first.is_none() || second.is_none() {
171 *second = first.take();
172 *first = Some(std::mem::take(s));
173 }
174 }
175 }
176 }
177 match child {
178 NodeOrText::AppendNode(child) => {
179 if parent.as_document().is_some() {
180 if let Some(child_elem) = child.as_element() {
181 let new_start = parent.len();
182 let len = child.len();
183 child_elem.start_offset.set(new_start);
184 child_elem.end_offset.set(new_start + len);
185 }
186 }
187 if let Some(e) = child.as_element() {
188 let attrs = e.attributes.borrow();
189 for a in &attrs.0 {
190 if &*a.0.local == ftml_parser::FtmlKey::Definition.attr_name() {
191 self.newline();
192 self.add("DEFINITION: ");
193 } else if &*a.0.local == ftml_parser::FtmlKey::Assertion.attr_name() {
194 self.newline();
195 self.add("ASSERTION: ");
196 } else if &*a.0.local == ftml_parser::FtmlKey::Example.attr_name() {
197 self.newline();
198 self.add("EXAMPLE: ");
199 } else if &*a.0.local == ftml_parser::FtmlKey::Problem.attr_name()
200 || &*a.0.local == ftml_parser::FtmlKey::SubProblem.attr_name()
201 {
202 self.newline();
203 self.add("PROBLEM: ");
204 }
205 }
206 drop(attrs);
207 if &*e.name.local == "msup" {
208 self.states
209 .borrow_mut()
210 .push(State::Msup(String::new(), None, None));
211 } else if &*e.name.local == "msub" {
212 self.states
213 .borrow_mut()
214 .push(State::Msub(String::new(), None, None));
215 } else if &*e.name.local == "mfrac" {
216 self.states
217 .borrow_mut()
218 .push(State::Mfrac(String::new(), None, None));
219 } else if &*e.name.local == "tr" {
220 self.newline();
221 } else if &*e.name.local == "td" {
222 self.add("|");
223 } else if &*e.name.local == "body" {
224 self.in_body.set(true);
225 }
226 }
227 parent.append(child);
228 }
229 NodeOrText::AppendText(text) => {
230 if let Some(elem) = parent.as_element() {
231 let len = if matches!(
232 &*elem.name.local,
233 "style"
234 | "script"
235 | "xmp"
236 | "iframe"
237 | "noembed"
238 | "noframes"
239 | "plaintext"
240 | "noscript"
241 ) {
242 text.as_bytes().len()
243 } else {
244 escaped_len(&text, false)
245 };
246 prolong(parent, len as isize);
247 if self.in_body.get() {
248 let txt = text.trim();
249 if !txt.is_empty() {
250 self.add(txt);
251 }
252 }
253 }
254 if let Some(last_child) = parent.last_child()
255 && let Some(existing) = last_child.as_text()
256 {
257 existing.borrow_mut().extend(text.chars());
258 return;
259 }
260 parent.append(NodeRef::new_text(text));
261 }
262 }
263 }
264
265 #[inline]
266 fn append_doctype_to_document(
267 &self,
268 name: StrTendril,
269 public_id: StrTendril,
270 system_id: StrTendril,
271 ) {
272 self.document_node
273 .append(NodeRef::new_doctype(name, public_id, system_id));
274 }
275
276 #[inline]
277 fn append_based_on_parent_node(
278 &self,
279 element: &Self::Handle,
280 prev_element: &Self::Handle,
281 child: NodeOrText<Self::Handle>,
282 ) {
283 if element.parent().is_some() {
284 self.append_before_sibling(element, child);
285 } else {
286 self.append(prev_element, child);
287 }
288 }
289
290 fn pop(&self, node: &Self::Handle) {
291 let Some(elem) = node.as_element() else {
292 return;
293 };
294 if elem.closed.get() {
295 return;
296 }
297 elem.closed.set(true);
298 for c in node.children() {
299 self.pop(&c);
300 }
301 let attrs = elem.attributes.borrow();
302 if attrs.0.iter().any(|a| {
303 [
304 ftml_parser::FtmlKey::Definition.attr_name(),
305 ftml_parser::FtmlKey::Assertion.attr_name(),
306 ftml_parser::FtmlKey::Example.attr_name(),
307 ftml_parser::FtmlKey::Problem.attr_name(),
308 ftml_parser::FtmlKey::SubProblem.attr_name(),
309 ]
310 .contains(&&*a.0.local)
311 }) {
312 self.newline();
313 }
314 drop(attrs);
315
316 let mut sts = self.states.borrow_mut();
317 if &elem.name.local == "body" {
318 let range = DocumentRange {
319 start: elem.start_offset.get(),
320 end: elem.end_offset.get(),
321 };
322 let off = elem.attributes.borrow().len();
323 self.body.set((range, "<body>".len() + off));
324 self.in_body.set(false);
325 } else if &elem.name.local == "msup"
326 && let Some(State::Msup(s, Some(a), _)) = sts.pop()
327 {
328 drop(sts);
329 self.pair('^', a, s);
330 } else if &elem.name.local == "msub"
331 && let Some(State::Msub(s, Some(a), _)) = sts.pop()
332 {
333 drop(sts);
334 self.pair('_', a, s);
335 } else if &elem.name.local == "mfrac"
336 && let Some(State::Mfrac(s, Some(a), _)) = sts.pop()
337 {
338 drop(sts);
339 self.add(&format!("{{{a}}}/{{{s}}}"));
340 }
341 }
342
343 #[inline]
344 fn append_before_sibling(&self, _sibling: &Self::Handle, _child: NodeOrText<Self::Handle>) {
345 unreachable!()
346 }
347
348 #[inline]
349 fn remove_from_parent(&self, _target: &Self::Handle) {
350 unreachable!()
351 }
352 #[inline]
353 fn reparent_children(&self, _node: &Self::Handle, _new_parent: &Self::Handle) {
354 unreachable!()
355 }
356 #[inline]
357 fn mark_script_already_started(&self, _node: &Self::Handle) {
358 unreachable!()
359 }
360 fn get_template_contents(&self, _target: &Self::Handle) -> Self::Handle {
361 unreachable!()
362 }
363 #[inline]
364 fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<html5ever::Attribute>) {
365 if let Some(e) = target.as_element() {
366 let mut ats = e.attributes.borrow_mut();
367 for a in attrs {
368 if let Some(att) = ats.0.iter_mut().find(|att| att.0 == a.name) {
369 *att = (a.name, a.value);
370 } else {
371 ats.0.push((a.name, a.value));
372 }
373 }
374 }
375 }
376}
377
378#[allow(clippy::cast_sign_loss)]
379#[allow(clippy::cast_possible_wrap)]
380fn prolong(parent: &NodeRef, len: isize) {
381 if let Some(elem) = parent.as_element() {
382 let end = elem.end_offset.get();
383 elem.end_offset.set(((end as isize) + len) as usize);
384 if let Some(p) = parent.parent() {
385 prolong(&p, len);
386 }
387 }
388}