flams_ftml/parser/
mod.rs

1mod nodes;
2pub mod termsnotations;
3
4use std::cell::{Cell, RefCell};
5
6use flams_ontology::{
7    languages::Language,
8    narration::{
9        documents::{DocumentStyles, UncheckedDocument},
10        LazyDocRef,
11    },
12    triple,
13    uris::{
14        ArchiveId, ArchiveURI, ArchiveURITrait, BaseURI, DocumentURI, ModuleURI, SymbolURI,
15        URIOrRefTrait, URIRefTrait, URIWithLanguage,
16    },
17    DocumentRange,
18};
19use flams_system::{
20    backend::{AnyBackend, Backend},
21    formats::{HTMLData, OMDocResult},
22};
23use flams_utils::{prelude::HSet, CSS};
24use ftml_extraction::{
25    errors::FTMLError,
26    open::{
27        terms::{OpenTerm, VarOrSym},
28        OpenFTMLElement,
29    },
30    prelude::{
31        all_rules, Attributes, ExtractorState, FTMLElements, FTMLNode, FTMLTag, RuleSet,
32        StatefulExtractor,
33    },
34};
35use html5ever::{
36    interface::{NodeOrText, TreeSink},
37    parse_document,
38    serialize::SerializeOpts,
39    tendril::{SliceExt, StrTendril, TendrilSink},
40    ParseOpts, QualName,
41};
42use nodes::{ElementData, NodeData, NodeRef};
43
/// html5ever `TreeSink` that builds the document's node tree and runs FTML
/// extraction on the elements while they are being appended.
///
/// Instances are created by [`HTMLParser::run`] and consumed by `finish`.
pub struct HTMLParser<'p> {
    /// Root node of the tree; returned by `get_document` and serialized in `finish`.
    document_node: NodeRef,
    /// Passed on to `Backend::submit_triples` together with the document URI.
    rel_path: &'p str,
    /// Extraction state. Kept in a `RefCell` because the sink methods only get `&self`.
    extractor: RefCell<Extractor<'p>>,
    /// Set in `pop` when the `<body>` element is closed: the element's
    /// start/end offsets, and `"<body>".len()` plus the length reported by its
    /// attributes. Returned from `finish` as `HTMLData::body` / `inner_offset`.
    body: Cell<(DocumentRange, usize)>,
}
50
/// Mutable state shared by the FTML extraction rules while a document is parsed.
struct Extractor<'a> {
    /// Accumulated error messages; `finish` returns `Err` when this is non-empty.
    errors: String,
    /// Stylesheets collected from `<link>`/`<style>` elements inside `<head>`.
    css: Vec<CSS>,
    /// Byte buffer that `add_resource` appends bincode-encoded resources to.
    refs: Vec<u8>,
    /// RDF triples gathered for the document; submitted to the backend in `finish`.
    triples: HSet<flams_ontology::rdf::Triple>,
    /// Document title, if one was set via `set_document_title`.
    title: Option<Box<str>>,
    /// Backend used to resolve image paths and to submit the triples.
    backend: &'a AnyBackend,
    /// State of the FTML extractor (exposed through `state` / `state_mut`).
    state: ExtractorState,
}
61
62impl StatefulExtractor for Extractor<'_> {
63    type Attr<'a> = nodes::Attributes;
64    const RDF: bool = true;
65
66    fn add_resource<T: flams_ontology::Resourcable>(&mut self, t: &T) -> LazyDocRef<T> {
67        struct VecWriter<'a>(&'a mut Vec<u8>);
68        impl bincode::enc::write::Writer for VecWriter<'_> {
69            fn write(&mut self, bytes: &[u8]) -> Result<(), bincode::error::EncodeError> {
70                self.0.extend_from_slice(bytes);
71                Ok(())
72            }
73        }
74        let off = self.refs.len();
75        let _ = bincode::serde::encode_into_writer(
76            t,
77            VecWriter(&mut self.refs),
78            bincode::config::standard(),
79        );
80        LazyDocRef::new(off, self.refs.len(), self.state.document_uri().clone())
81    }
82
83    #[inline]
84    fn state(&self) -> &ExtractorState {
85        &self.state
86    }
87    #[inline]
88    fn state_mut(&mut self) -> &mut ExtractorState {
89        &mut self.state
90    }
91    #[inline]
92    fn set_document_title(&mut self, title: Box<str>) {
93        self.title = Some(title);
94    }
95
96    #[inline]
97    fn add_triples<const N: usize>(&mut self, triples: [flams_ontology::rdf::Triple; N]) {
98        self.triples.extend(triples);
99    }
100    #[inline]
101    fn add_error(&mut self, err: FTMLError) {
102        self.errors.push_str(&(err.to_string() + "\n"));
103    }
104
105    /*
106      fn resolve_variable_name(&self,_name:&Name) -> Var {todo!()}
107      fn in_notation(&self) -> bool {todo!()}
108      fn set_in_notation(&mut self,_value:bool) {todo!()}
109      fn in_term(&self) -> bool {todo!()}
110      fn set_in_term(&mut self,_value:bool) {todo!()}
111    */
112}
113
114impl<'p> HTMLParser<'p> {
115    pub fn run(
116        input: &str,
117        uri: DocumentURI,
118        rel_path: &'p str,
119        backend: &'p AnyBackend,
120    ) -> Result<(OMDocResult, String), String> {
121        let iri = uri.to_iri();
122        let mut triples = HSet::default();
123        for t in [
124            triple!(<(iri.clone())> dc:LANGUAGE = (uri.language().to_string()) ),
125            triple!(<(iri.clone())> : ulo:DOCUMENT),
126            triple!(<(uri.archive_uri().to_iri())> ulo:CONTAINS <(iri)>),
127        ] {
128            triples.insert(t);
129        }
130        /*
131        let document = UncheckedDocument {
132          uri,
133          title:None,
134          elements:Vec::new()
135        };*/
136
137        parse_document(
138            Self {
139                document_node: NodeRef::new_document(),
140                body: Cell::new((DocumentRange { start: 0, end: 0 }, 0)),
141                rel_path,
142                extractor: RefCell::new(Extractor {
143                    backend,
144                    triples, //document,
145                    errors: String::new(),
146                    title: None,
147                    css: Vec::new(),
148                    refs: Vec::new(),
149                    state: ExtractorState::new(uri),
150                }),
151            },
152            ParseOpts::default(),
153        )
154        .from_utf8()
155        .one(input.as_bytes().to_tendril())
156    }
157}
158
159impl TreeSink for HTMLParser<'_> {
160    type Handle = NodeRef;
161    type Output = Result<(OMDocResult, String), String>;
162    type ElemName<'a>
163        = &'a QualName
164    where
165        Self: 'a;
166
167    fn finish(self) -> Self::Output {
168        for c in self.document_node.children() {
169            self.pop(&c);
170        }
171        let mut html = Vec::new();
172        let Extractor {
173            errors,
174            mut css,
175            refs,
176            title,
177            triples,
178            state,
179            backend,
180            ..
181        } = self.extractor.into_inner();
182        if !errors.is_empty() {
183            return Err(errors);
184            /*
185            return BuildResult {
186              log:Either::Left(errors),
187              result:Err(Vec::new())
188            } */
189        }
190        css = CSS::merge(std::mem::take(&mut css));
191        //css.sort();
192        let Ok((uri, elems, modules, styles)) = state.take() else {
193            return Err("Unbalanced FTML document".to_string());
194            /*return BuildResult {
195              log:Either::Left("Unbalanced FTML document".to_string()),
196              result:Err(Vec::new())
197            }*/
198        };
199
200        let _ = html5ever::serialize(&mut html, &self.document_node, SerializeOpts::default());
201        let html = String::from_utf8_lossy(&html).into();
202        backend.submit_triples(&uri, self.rel_path, triples.into_iter());
203        let (body, inner_offset) = self.body.get();
204        Ok((
205            OMDocResult {
206                document: UncheckedDocument {
207                    uri,
208                    title,
209                    styles,
210                    elements: elems,
211                },
212                html: HTMLData {
213                    html,
214                    css,
215                    refs,
216                    body,
217                    inner_offset,
218                },
219                modules,
220            },
221            errors,
222        ))
223        /*
224        BuildResult {
225          log:Either::Left(errors),
226          result:Ok(BuildResultArtifact::Data(Box::new(OMDocResult {
227            document: UncheckedDocument {
228              uri,title,elements:elems
229            },
230            html,css,refs,modules,
231            body,inner_offset
232          })))
233        } */
234    }
235
236    #[inline]
237    fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
238        self.extractor.borrow_mut().errors.push_str(&msg);
239    }
240    #[inline]
241    fn get_document(&self) -> Self::Handle {
242        self.document_node.clone()
243    }
244    #[inline]
245    fn set_quirks_mode(&self, mode: html5ever::interface::QuirksMode) {
246        let NodeData::Document(r) = self.document_node.data() else {
247            unreachable!()
248        };
249        r.set(mode);
250    }
251
    /// Whether the two handles compare equal under `NodeRef`'s `PartialEq`.
    #[inline]
    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
        x == y
    }
256
257    #[inline]
258    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
259        &target.as_element().unwrap_or_else(|| unreachable!()).name
260    }
261
262    #[inline]
263    fn create_element(
264        &self,
265        name: QualName,
266        attrs: Vec<html5ever::Attribute>,
267        _: html5ever::interface::ElementFlags,
268    ) -> Self::Handle {
269        NodeRef::new_element(name, attrs.into())
270    }
    /// Creates a detached comment node holding `text`.
    #[inline]
    fn create_comment(&self, text: StrTendril) -> NodeRef {
        NodeRef::new_comment(text)
    }
    /// Creates a detached processing-instruction node from `target` and `data`.
    #[inline]
    fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
        NodeRef::new_processing_instruction(target, data)
    }
279
    /// Appends `child` as the new last child of `parent`, keeping the byte
    /// offsets stored on element nodes up to date.
    ///
    /// In order:
    /// 1. The current last child of `parent`, if any, is closed via `pop`.
    /// 2. Node child: an `<img>` gets its `src` rewritten (see the inline
    ///    comments); an element child gets `start_offset`/`end_offset`
    ///    assigned; if `parent` is an element, the FTML rules are run on the
    ///    child's attributes and the `end_offset` of `parent` and its element
    ///    ancestors is extended by the child's length.
    /// 3. Text child: if `parent` is an element, the `end_offset` of `parent`
    ///    and its element ancestors is extended by the text's length; the
    ///    text is then merged into a trailing text node if there is one, and
    ///    appended as a new text node otherwise.
    // The casts flagged by `cast_possible_wrap` are the `usize` lengths handed to `prolong`.
    #[allow(clippy::cast_possible_wrap)]
    #[allow(clippy::too_many_lines)]
    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
        // A new child arriving means the previous last child is complete.
        if let Some(e) = parent.last_child() {
            self.pop(&e);
        }
        //println!("Current parent: {}: >>>>{}<<<<",parent.len(),parent.string().replace('\n'," ").replace('\t'," "));
        //assert_eq!(parent.len(),parent.string().len());
        match child {
            NodeOrText::AppendNode(child) => {
                // `<img>`: blank `src` and put the rewritten location into
                // `data-flams-src` instead.
                if child
                    .as_element()
                    .is_some_and(|n| n.name.local.as_ref().eq_ignore_ascii_case("img"))
                {
                    let Some(child_elem) = child.as_element() else {
                        unreachable!()
                    };
                    let mut attributes = child_elem.attributes.borrow_mut();
                    if let Some(src) = attributes.value("src") {
                        let path = std::path::Path::new(src);
                        // Case 1: the backend resolves the path to an archive.
                        // NOTE(review): `&rp[1..]` skips the first byte of `rp`
                        // (presumably a leading `/`) and would panic on an
                        // empty `rp` - confirm that cannot happen.
                        if let Some(newsrc) =
                            self.extractor.borrow().backend.archive_of(path, |a, rp| {
                                format!("srv:/img?a={}&rp={}", a.id(), &rp[1..])
                            })
                        {
                            attributes.set("src", "");
                            attributes.new_attr("data-flams-src", newsrc);
                        } else {
                            let kpsewhich = &*tex_engine::engine::filesystem::kpathsea::KPATHSEA;
                            // Bare file name: everything after the last `/`.
                            let last = src.rsplit_once('/').map_or(src, |(_, p)| p);
                            if let Some(file) = kpsewhich.which(last) {
                                // Case 2: kpathsea resolves the bare name to exactly
                                // this path. If it resolves to a different file,
                                // `src` is left untouched.
                                if file == path {
                                    let file = format!("srv:/img?kpse={last}");
                                    attributes.set("src", "");
                                    attributes.new_attr("data-flams-src", file);
                                }
                            } else {
                                // Case 3: unknown to kpathsea; pass the original
                                // `src` along as a `file` query.
                                let file = format!("srv:/img?file={src}");
                                attributes.set("src", "");
                                attributes.new_attr("data-flams-src", file);
                            }
                            // TODO
                        };
                    }
                    // Release the borrow before `update_len`; presumably it
                    // re-reads the attributes to recompute the element's length.
                    drop(attributes);
                    NodeRef::update_len(child_elem);
                }
                //println!("Current Child: {}: >>>>{}<<<<",child.len(),child.string().replace('\n'," ").replace('\t'," "));
                //assert_eq!(child.len(),child.string().len());
                if parent.as_document().is_some() {
                    // Directly below the document: the child starts at the
                    // document's current length. No FTML rules are applied here.
                    if let Some(child_elem) = child.as_element() {
                        let new_start = parent.len();
                        let len = child.len();
                        child_elem.start_offset.set(new_start);
                        child_elem.end_offset.set(new_start + len);
                    }
                } else if let Some(parent_elem) = parent.as_element() {
                    // Presumably the offset at which the parent's closing tag
                    // currently starts - TODO confirm against `nodes::tag_len`.
                    let new_start =
                        parent_elem.end_offset.get() - nodes::tag_len(&parent_elem.name) - 1;
                    if let Some(child_elem) = child.as_element() {
                        {
                            let mut attributes = child_elem.attributes.borrow_mut();
                            let mut extractor = self.extractor.borrow_mut();
                            if let Some(elements) =
                                all_rules().applicable_rules(&mut *extractor, &mut *attributes)
                            {
                                // `update_attributes` borrows the attributes
                                // again, so the borrow must end first.
                                drop(attributes);
                                update_attributes(&elements, child_elem);
                                child_elem.ftml.set(Some(elements));
                            } else {
                                drop(attributes);
                                NodeRef::update_len(child_elem);
                            }
                        }
                        // Read the length only after the attributes are final.
                        let len = child.len();
                        child_elem.start_offset.set(new_start);
                        child_elem.end_offset.set(new_start + len);
                    }
                    //println!("Updated Child: {}: >>>>{}<<<<",child.len(),child.string().replace('\n'," ").replace('\t'," "));
                    //assert_eq!(child.len(),child.string().len());
                    // Extend the parent and its element ancestors by the child's length.
                    prolong(parent, child.len() as isize);
                }
                parent.append(child);
                //println!("New parent: {}: >>>>{}<<<<",parent.len(),parent.string().replace('\n'," ").replace('\t'," "));
                //assert_eq!(parent.len(),parent.string().len());
            }
            NodeOrText::AppendText(text) => {
                if let Some(elem) = parent.as_element() {
                    // Inside the listed elements the text counts with its raw
                    // byte length; everywhere else with `nodes::escaped_len`.
                    let len = if matches!(
                        &*elem.name.local,
                        "style"
                            | "script"
                            | "xmp"
                            | "iframe"
                            | "noembed"
                            | "noframes"
                            | "plaintext"
                            | "noscript"
                    ) {
                        text.as_bytes().len()
                    } else {
                        nodes::escaped_len(&text, false)
                    };
                    prolong(parent, len as isize);
                }
                // Merge into a trailing text node instead of adding a sibling.
                if let Some(last_child) = parent.last_child() {
                    if let Some(existing) = last_child.as_text() {
                        existing.borrow_mut().extend(text.chars());
                        return;
                    }
                }
                parent.append(NodeRef::new_text(text));
                //assert_eq!(parent.len(),parent.string().len());
            }
        }
    }
396
397    #[inline]
398    fn append_doctype_to_document(
399        &self,
400        name: StrTendril,
401        public_id: StrTendril,
402        system_id: StrTendril,
403    ) {
404        self.document_node
405            .append(NodeRef::new_doctype(name, public_id, system_id));
406    }
407
408    #[inline]
409    fn append_based_on_parent_node(
410        &self,
411        element: &Self::Handle,
412        prev_element: &Self::Handle,
413        child: NodeOrText<Self::Handle>,
414    ) {
415        if element.parent().is_some() {
416            self.append_before_sibling(element, child);
417        } else {
418            self.append(prev_element, child);
419        }
420    }
421
422    fn pop(&self, node: &Self::Handle) {
423        let Some(elem) = node.as_element() else {
424            return;
425        };
426        if elem.closed.get() {
427            return;
428        }
429        elem.closed.set(true);
430        for c in node.children() {
431            self.pop(&c);
432        }
433        if &elem.name.local == "body" {
434            let range = DocumentRange {
435                start: elem.start_offset.get(),
436                end: elem.end_offset.get(),
437            };
438            let off = elem.attributes.borrow().len();
439            self.body.set((range, "<body>".len() + off));
440        } else if matches!(&*elem.name.local, "link" | "style") {
441            if let Some(p) = node.parent() {
442                if let Some(pe) = p.as_element() {
443                    if &pe.name.local == "head" {
444                        match &*elem.name.local {
445                            "link" => {
446                                let attrs = elem.attributes.borrow();
447                                if attrs.value("rel") == Some("stylesheet") {
448                                    if let Some(lnk) = attrs.value("href") {
449                                        let val = CSS_SUBSTS.get(lnk).map_or_else(
450                                            || lnk.to_string().into_boxed_str(),
451                                            |v| (*v).to_string().into_boxed_str(),
452                                        );
453                                        self.extractor.borrow_mut().css.push(CSS::Link(val));
454                                        node.delete();
455                                        return;
456                                    }
457                                }
458                            }
459                            "style" => {
460                                let str = node
461                                    .children()
462                                    .filter_map(|c| c.as_text().map(|s| s.borrow().to_string()))
463                                    .collect::<String>();
464                                // update: will get sorted / processed in bulk later
465                                self.extractor.borrow_mut().css.push(CSS::Inline(str.into()));//.extend(CSS::split(&str));
466                                node.delete();
467                                return;
468                            }
469                            _ => unreachable!(),
470                        }
471                    }
472                }
473            }
474        }
475        if let Some(mut elems) = elem.ftml.take() {
476            let mut extractor = self.extractor.borrow_mut();
477            elems.close(&mut *extractor, node);
478            if !elems.is_empty() {
479                elem.ftml.set(Some(elems));
480            }
481        }
482    }
483
    /// Not supported by this sink: panics via `unreachable!()` when called.
    ///
    /// NOTE(review): `append_based_on_parent_node` forwards here whenever
    /// `element` has a parent, so this can be reached if html5ever ever takes
    /// that path - confirm the expected input never triggers it.
    #[inline]
    fn append_before_sibling(&self, _sibling: &Self::Handle, _child: NodeOrText<Self::Handle>) {
        unreachable!()
        /*
        Disabled implementation, kept for reference (it does not maintain
        the element offsets):

        match child {
          NodeOrText::AppendNode(node) => sibling.insert_before(node),
          NodeOrText::AppendText(text) => {
              if let Some(previous_sibling) = sibling.previous_sibling() {
                  if let Some(existing) = previous_sibling.as_text() {
                      existing.borrow_mut().extend(text.chars());
                      return;
                  }
              }
              sibling.insert_before(NodeRef::new_text(text));
          }
        }
         */
    }
502
    /// Not supported by this sink: panics via `unreachable!()` when called.
    #[inline]
    fn remove_from_parent(&self, _target: &Self::Handle) {
        unreachable!()
    }
    /// Not supported by this sink: panics via `unreachable!()` when called.
    #[inline]
    fn reparent_children(&self, _node: &Self::Handle, _new_parent: &Self::Handle) {
        unreachable!()
    }
    /// Not supported by this sink: panics via `unreachable!()` when called.
    #[inline]
    fn mark_script_already_started(&self, _node: &Self::Handle) {
        unreachable!()
    }
    /// Not supported by this sink: panics via `unreachable!()` when called.
    fn get_template_contents(&self, _target: &Self::Handle) -> Self::Handle {
        unreachable!()
    }
    /// Not supported by this sink: panics via `unreachable!()` when called.
    #[inline]
    fn add_attrs_if_missing(&self, _target: &Self::Handle, _attrs: Vec<html5ever::Attribute>) {
        unreachable!()
    }
522}
523
/// Substitutions for stylesheet URLs, applied when a `<link rel="stylesheet">`
/// inside `<head>` is collected in `pop`: the key is the `href` as written in
/// the document, the value is the link recorded in its place. Any `href`
/// without an entry is recorded unchanged.
static CSS_SUBSTS: phf::Map<&'static str, &'static str> = phf::phf_map! {
  "https://raw.githack.com/Jazzpirate/RusTeX/main/rustex/src/resources/rustex.css"
  => "srv:/rustex.css"
};
528
529fn update_attributes(elements: &FTMLElements, child: &ElementData) {
530    let mut attrs = child.attributes.borrow_mut();
531    for e in &elements.elems {
532        match e {
533            OpenFTMLElement::ImportModule(uri) => attrs.update(FTMLTag::ImportModule, uri),
534            OpenFTMLElement::UseModule(uri) => attrs.update(FTMLTag::UseModule, uri),
535            OpenFTMLElement::MathStructure { uri, .. } => {
536                attrs.update(FTMLTag::MathStructure, &uri.clone().into_module());
537            }
538            OpenFTMLElement::Morphism { uri, domain, .. } => {
539                attrs.update(FTMLTag::MorphismDomain, domain);
540                attrs.update(FTMLTag::Morphism, &uri.clone().into_module());
541            }
542            OpenFTMLElement::Assign(uri) => {
543                attrs.update(FTMLTag::Assign, uri);
544            }
545            // Paragraphs: fors-list
546            OpenFTMLElement::Symdecl { uri, .. } => {
547                attrs.update(FTMLTag::Symdecl, uri);
548            }
549            OpenFTMLElement::Notation {
550                symbol: VarOrSym::S(uri),
551                ..
552            } => {
553                attrs.update(FTMLTag::Notation, uri);
554            }
555            OpenFTMLElement::Definiendum(uri) => {
556                attrs.update(FTMLTag::Definiendum, uri);
557            }
558            OpenFTMLElement::Conclusion { uri, .. } => {
559                attrs.update(FTMLTag::Conclusion, uri);
560            }
561            OpenFTMLElement::Definiens { uri: Some(uri), .. } => {
562                attrs.update(FTMLTag::Definiens, uri);
563            }
564            OpenFTMLElement::Inputref { uri, .. } => {
565                attrs.update(FTMLTag::InputRef, uri);
566            }
567            OpenFTMLElement::OpenTerm {
568                term:
569                    OpenTerm::Symref { uri, .. }
570                    | OpenTerm::OMA {
571                        head: VarOrSym::S(uri),
572                        ..
573                    }
574                    | OpenTerm::Complex(VarOrSym::S(uri), ..),
575                ..
576            } => attrs.update(FTMLTag::Head, uri),
577            _ => (),
578        }
579    }
580    drop(attrs);
581    NodeRef::update_len(child);
582}
583
584#[allow(clippy::cast_sign_loss)]
585#[allow(clippy::cast_possible_wrap)]
586fn prolong(parent: &NodeRef, len: isize) {
587    if let Some(elem) = parent.as_element() {
588        let end = elem.end_offset.get();
589        elem.end_offset.set(((end as isize) + len) as usize);
590        if let Some(p) = parent.parent() {
591            prolong(&p, len);
592        }
593    }
594}