1mod nodes;
2pub mod termsnotations;
3
4use std::cell::{Cell, RefCell};
5
6use flams_ontology::{
7 languages::Language,
8 narration::{
9 documents::{DocumentStyles, UncheckedDocument},
10 LazyDocRef,
11 },
12 triple,
13 uris::{
14 ArchiveId, ArchiveURI, ArchiveURITrait, BaseURI, DocumentURI, ModuleURI, SymbolURI,
15 URIOrRefTrait, URIRefTrait, URIWithLanguage,
16 },
17 DocumentRange,
18};
19use flams_system::{
20 backend::{AnyBackend, Backend},
21 formats::{HTMLData, OMDocResult},
22};
23use flams_utils::{prelude::HSet, CSS};
24use ftml_extraction::{
25 errors::FTMLError,
26 open::{
27 terms::{OpenTerm, VarOrSym},
28 OpenFTMLElement,
29 },
30 prelude::{
31 all_rules, Attributes, ExtractorState, FTMLElements, FTMLNode, FTMLTag, RuleSet,
32 StatefulExtractor,
33 },
34};
35use html5ever::{
36 interface::{NodeOrText, TreeSink},
37 parse_document,
38 serialize::SerializeOpts,
39 tendril::{SliceExt, StrTendril, TendrilSink},
40 ParseOpts, QualName,
41};
42use nodes::{ElementData, NodeData, NodeRef};
43
44pub struct HTMLParser<'p> {
45 document_node: NodeRef,
46 rel_path: &'p str,
47 extractor: RefCell<Extractor<'p>>,
48 body: Cell<(DocumentRange, usize)>,
49}
50
51struct Extractor<'a> {
52 errors: String,
53 css: Vec<CSS>,
54 refs: Vec<u8>,
55 triples: HSet<flams_ontology::rdf::Triple>,
56 title: Option<Box<str>>,
57 backend: &'a AnyBackend,
59 state: ExtractorState,
60}
61
62impl StatefulExtractor for Extractor<'_> {
63 type Attr<'a> = nodes::Attributes;
64 const RDF: bool = true;
65
66 fn add_resource<T: flams_ontology::Resourcable>(&mut self, t: &T) -> LazyDocRef<T> {
67 struct VecWriter<'a>(&'a mut Vec<u8>);
68 impl bincode::enc::write::Writer for VecWriter<'_> {
69 fn write(&mut self, bytes: &[u8]) -> Result<(), bincode::error::EncodeError> {
70 self.0.extend_from_slice(bytes);
71 Ok(())
72 }
73 }
74 let off = self.refs.len();
75 let _ = bincode::serde::encode_into_writer(
76 t,
77 VecWriter(&mut self.refs),
78 bincode::config::standard(),
79 );
80 LazyDocRef::new(off, self.refs.len(), self.state.document_uri().clone())
81 }
82
83 #[inline]
84 fn state(&self) -> &ExtractorState {
85 &self.state
86 }
87 #[inline]
88 fn state_mut(&mut self) -> &mut ExtractorState {
89 &mut self.state
90 }
91 #[inline]
92 fn set_document_title(&mut self, title: Box<str>) {
93 self.title = Some(title);
94 }
95
96 #[inline]
97 fn add_triples<const N: usize>(&mut self, triples: [flams_ontology::rdf::Triple; N]) {
98 self.triples.extend(triples);
99 }
100 #[inline]
101 fn add_error(&mut self, err: FTMLError) {
102 self.errors.push_str(&(err.to_string() + "\n"));
103 }
104
105 }
113
114impl<'p> HTMLParser<'p> {
115 pub fn run(
116 input: &str,
117 uri: DocumentURI,
118 rel_path: &'p str,
119 backend: &'p AnyBackend,
120 ) -> Result<(OMDocResult, String), String> {
121 let iri = uri.to_iri();
122 let mut triples = HSet::default();
123 for t in [
124 triple!(<(iri.clone())> dc:LANGUAGE = (uri.language().to_string()) ),
125 triple!(<(iri.clone())> : ulo:DOCUMENT),
126 triple!(<(uri.archive_uri().to_iri())> ulo:CONTAINS <(iri)>),
127 ] {
128 triples.insert(t);
129 }
130 parse_document(
138 Self {
139 document_node: NodeRef::new_document(),
140 body: Cell::new((DocumentRange { start: 0, end: 0 }, 0)),
141 rel_path,
142 extractor: RefCell::new(Extractor {
143 backend,
144 triples, errors: String::new(),
146 title: None,
147 css: Vec::new(),
148 refs: Vec::new(),
149 state: ExtractorState::new(uri),
150 }),
151 },
152 ParseOpts::default(),
153 )
154 .from_utf8()
155 .one(input.as_bytes().to_tendril())
156 }
157}
158
159impl TreeSink for HTMLParser<'_> {
160 type Handle = NodeRef;
161 type Output = Result<(OMDocResult, String), String>;
162 type ElemName<'a>
163 = &'a QualName
164 where
165 Self: 'a;
166
167 fn finish(self) -> Self::Output {
168 for c in self.document_node.children() {
169 self.pop(&c);
170 }
171 let mut html = Vec::new();
172 let Extractor {
173 errors,
174 mut css,
175 refs,
176 title,
177 triples,
178 state,
179 backend,
180 ..
181 } = self.extractor.into_inner();
182 if !errors.is_empty() {
183 return Err(errors);
184 }
190 css = CSS::merge(std::mem::take(&mut css));
191 let Ok((uri, elems, modules, styles)) = state.take() else {
193 return Err("Unbalanced FTML document".to_string());
194 };
199
200 let _ = html5ever::serialize(&mut html, &self.document_node, SerializeOpts::default());
201 let html = String::from_utf8_lossy(&html).into();
202 backend.submit_triples(&uri, self.rel_path, triples.into_iter());
203 let (body, inner_offset) = self.body.get();
204 Ok((
205 OMDocResult {
206 document: UncheckedDocument {
207 uri,
208 title,
209 styles,
210 elements: elems,
211 },
212 html: HTMLData {
213 html,
214 css,
215 refs,
216 body,
217 inner_offset,
218 },
219 modules,
220 },
221 errors,
222 ))
223 }
235
236 #[inline]
237 fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
238 self.extractor.borrow_mut().errors.push_str(&msg);
239 }
240 #[inline]
241 fn get_document(&self) -> Self::Handle {
242 self.document_node.clone()
243 }
244 #[inline]
245 fn set_quirks_mode(&self, mode: html5ever::interface::QuirksMode) {
246 let NodeData::Document(r) = self.document_node.data() else {
247 unreachable!()
248 };
249 r.set(mode);
250 }
251
252 #[inline]
253 fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
254 x == y
255 }
256
257 #[inline]
258 fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
259 &target.as_element().unwrap_or_else(|| unreachable!()).name
260 }
261
262 #[inline]
263 fn create_element(
264 &self,
265 name: QualName,
266 attrs: Vec<html5ever::Attribute>,
267 _: html5ever::interface::ElementFlags,
268 ) -> Self::Handle {
269 NodeRef::new_element(name, attrs.into())
270 }
271 #[inline]
272 fn create_comment(&self, text: StrTendril) -> NodeRef {
273 NodeRef::new_comment(text)
274 }
275 #[inline]
276 fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
277 NodeRef::new_processing_instruction(target, data)
278 }
279
280 #[allow(clippy::cast_possible_wrap)]
281 #[allow(clippy::too_many_lines)]
282 fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
283 if let Some(e) = parent.last_child() {
284 self.pop(&e);
285 }
286 match child {
289 NodeOrText::AppendNode(child) => {
290 if child
291 .as_element()
292 .is_some_and(|n| n.name.local.as_ref().eq_ignore_ascii_case("img"))
293 {
294 let Some(child_elem) = child.as_element() else {
295 unreachable!()
296 };
297 let mut attributes = child_elem.attributes.borrow_mut();
298 if let Some(src) = attributes.value("src") {
299 let path = std::path::Path::new(src);
300 if let Some(newsrc) =
301 self.extractor.borrow().backend.archive_of(path, |a, rp| {
302 format!("srv:/img?a={}&rp={}", a.id(), &rp[1..])
303 })
304 {
305 attributes.set("src", "");
306 attributes.new_attr("data-flams-src", newsrc);
307 } else {
308 let kpsewhich = &*tex_engine::engine::filesystem::kpathsea::KPATHSEA;
309 let last = src.rsplit_once('/').map_or(src, |(_, p)| p);
310 if let Some(file) = kpsewhich.which(last) {
311 if file == path {
312 let file = format!("srv:/img?kpse={last}");
313 attributes.set("src", "");
314 attributes.new_attr("data-flams-src", file);
315 }
316 } else {
317 let file = format!("srv:/img?file={src}");
318 attributes.set("src", "");
319 attributes.new_attr("data-flams-src", file);
320 }
321 };
323 }
324 drop(attributes);
325 NodeRef::update_len(child_elem);
326 }
327 if parent.as_document().is_some() {
330 if let Some(child_elem) = child.as_element() {
331 let new_start = parent.len();
332 let len = child.len();
333 child_elem.start_offset.set(new_start);
334 child_elem.end_offset.set(new_start + len);
335 }
336 } else if let Some(parent_elem) = parent.as_element() {
337 let new_start =
338 parent_elem.end_offset.get() - nodes::tag_len(&parent_elem.name) - 1;
339 if let Some(child_elem) = child.as_element() {
340 {
341 let mut attributes = child_elem.attributes.borrow_mut();
342 let mut extractor = self.extractor.borrow_mut();
343 if let Some(elements) =
344 all_rules().applicable_rules(&mut *extractor, &mut *attributes)
345 {
346 drop(attributes);
347 update_attributes(&elements, child_elem);
348 child_elem.ftml.set(Some(elements));
349 } else {
350 drop(attributes);
351 NodeRef::update_len(child_elem);
352 }
353 }
354 let len = child.len();
355 child_elem.start_offset.set(new_start);
356 child_elem.end_offset.set(new_start + len);
357 }
358 prolong(parent, child.len() as isize);
361 }
362 parent.append(child);
363 }
366 NodeOrText::AppendText(text) => {
367 if let Some(elem) = parent.as_element() {
368 let len = if matches!(
369 &*elem.name.local,
370 "style"
371 | "script"
372 | "xmp"
373 | "iframe"
374 | "noembed"
375 | "noframes"
376 | "plaintext"
377 | "noscript"
378 ) {
379 text.as_bytes().len()
380 } else {
381 nodes::escaped_len(&text, false)
382 };
383 prolong(parent, len as isize);
384 }
385 if let Some(last_child) = parent.last_child() {
386 if let Some(existing) = last_child.as_text() {
387 existing.borrow_mut().extend(text.chars());
388 return;
389 }
390 }
391 parent.append(NodeRef::new_text(text));
392 }
394 }
395 }
396
397 #[inline]
398 fn append_doctype_to_document(
399 &self,
400 name: StrTendril,
401 public_id: StrTendril,
402 system_id: StrTendril,
403 ) {
404 self.document_node
405 .append(NodeRef::new_doctype(name, public_id, system_id));
406 }
407
408 #[inline]
409 fn append_based_on_parent_node(
410 &self,
411 element: &Self::Handle,
412 prev_element: &Self::Handle,
413 child: NodeOrText<Self::Handle>,
414 ) {
415 if element.parent().is_some() {
416 self.append_before_sibling(element, child);
417 } else {
418 self.append(prev_element, child);
419 }
420 }
421
422 fn pop(&self, node: &Self::Handle) {
423 let Some(elem) = node.as_element() else {
424 return;
425 };
426 if elem.closed.get() {
427 return;
428 }
429 elem.closed.set(true);
430 for c in node.children() {
431 self.pop(&c);
432 }
433 if &elem.name.local == "body" {
434 let range = DocumentRange {
435 start: elem.start_offset.get(),
436 end: elem.end_offset.get(),
437 };
438 let off = elem.attributes.borrow().len();
439 self.body.set((range, "<body>".len() + off));
440 } else if matches!(&*elem.name.local, "link" | "style") {
441 if let Some(p) = node.parent() {
442 if let Some(pe) = p.as_element() {
443 if &pe.name.local == "head" {
444 match &*elem.name.local {
445 "link" => {
446 let attrs = elem.attributes.borrow();
447 if attrs.value("rel") == Some("stylesheet") {
448 if let Some(lnk) = attrs.value("href") {
449 let val = CSS_SUBSTS.get(lnk).map_or_else(
450 || lnk.to_string().into_boxed_str(),
451 |v| (*v).to_string().into_boxed_str(),
452 );
453 self.extractor.borrow_mut().css.push(CSS::Link(val));
454 node.delete();
455 return;
456 }
457 }
458 }
459 "style" => {
460 let str = node
461 .children()
462 .filter_map(|c| c.as_text().map(|s| s.borrow().to_string()))
463 .collect::<String>();
464 self.extractor.borrow_mut().css.push(CSS::Inline(str.into()));node.delete();
467 return;
468 }
469 _ => unreachable!(),
470 }
471 }
472 }
473 }
474 }
475 if let Some(mut elems) = elem.ftml.take() {
476 let mut extractor = self.extractor.borrow_mut();
477 elems.close(&mut *extractor, node);
478 if !elems.is_empty() {
479 elem.ftml.set(Some(elems));
480 }
481 }
482 }
483
484 #[inline]
485 fn append_before_sibling(&self, _sibling: &Self::Handle, _child: NodeOrText<Self::Handle>) {
486 unreachable!()
487 }
502
503 #[inline]
504 fn remove_from_parent(&self, _target: &Self::Handle) {
505 unreachable!()
506 }
507 #[inline]
508 fn reparent_children(&self, _node: &Self::Handle, _new_parent: &Self::Handle) {
509 unreachable!()
510 }
511 #[inline]
512 fn mark_script_already_started(&self, _node: &Self::Handle) {
513 unreachable!()
514 }
515 fn get_template_contents(&self, _target: &Self::Handle) -> Self::Handle {
516 unreachable!()
517 }
518 #[inline]
519 fn add_attrs_if_missing(&self, _target: &Self::Handle, _attrs: Vec<html5ever::Attribute>) {
520 unreachable!()
521 }
522}
523
524static CSS_SUBSTS: phf::Map<&'static str, &'static str> = phf::phf_map! {
525 "https://raw.githack.com/Jazzpirate/RusTeX/main/rustex/src/resources/rustex.css"
526 => "srv:/rustex.css"
527};
528
529fn update_attributes(elements: &FTMLElements, child: &ElementData) {
530 let mut attrs = child.attributes.borrow_mut();
531 for e in &elements.elems {
532 match e {
533 OpenFTMLElement::ImportModule(uri) => attrs.update(FTMLTag::ImportModule, uri),
534 OpenFTMLElement::UseModule(uri) => attrs.update(FTMLTag::UseModule, uri),
535 OpenFTMLElement::MathStructure { uri, .. } => {
536 attrs.update(FTMLTag::MathStructure, &uri.clone().into_module());
537 }
538 OpenFTMLElement::Morphism { uri, domain, .. } => {
539 attrs.update(FTMLTag::MorphismDomain, domain);
540 attrs.update(FTMLTag::Morphism, &uri.clone().into_module());
541 }
542 OpenFTMLElement::Assign(uri) => {
543 attrs.update(FTMLTag::Assign, uri);
544 }
545 OpenFTMLElement::Symdecl { uri, .. } => {
547 attrs.update(FTMLTag::Symdecl, uri);
548 }
549 OpenFTMLElement::Notation {
550 symbol: VarOrSym::S(uri),
551 ..
552 } => {
553 attrs.update(FTMLTag::Notation, uri);
554 }
555 OpenFTMLElement::Definiendum(uri) => {
556 attrs.update(FTMLTag::Definiendum, uri);
557 }
558 OpenFTMLElement::Conclusion { uri, .. } => {
559 attrs.update(FTMLTag::Conclusion, uri);
560 }
561 OpenFTMLElement::Definiens { uri: Some(uri), .. } => {
562 attrs.update(FTMLTag::Definiens, uri);
563 }
564 OpenFTMLElement::Inputref { uri, .. } => {
565 attrs.update(FTMLTag::InputRef, uri);
566 }
567 OpenFTMLElement::OpenTerm {
568 term:
569 OpenTerm::Symref { uri, .. }
570 | OpenTerm::OMA {
571 head: VarOrSym::S(uri),
572 ..
573 }
574 | OpenTerm::Complex(VarOrSym::S(uri), ..),
575 ..
576 } => attrs.update(FTMLTag::Head, uri),
577 _ => (),
578 }
579 }
580 drop(attrs);
581 NodeRef::update_len(child);
582}
583
584#[allow(clippy::cast_sign_loss)]
585#[allow(clippy::cast_possible_wrap)]
586fn prolong(parent: &NodeRef, len: isize) {
587 if let Some(elem) = parent.as_element() {
588 let end = elem.end_offset.get();
589 elem.end_offset.set(((end as isize) + len) as usize);
590 if let Some(p) = parent.parent() {
591 prolong(&p, len);
592 }
593 }
594}