Skip to main content

flams_search/textify/
ever.rs

1use html5ever::{
2    LocalName, Namespace, QualName,
3    interface::QuirksMode,
4    serialize::{SerializeOpts, TraversalScope},
5    tendril::StrTendril,
6};
7use std::{
8    borrow::Cow,
9    cell::{Cell, RefCell},
10    rc::{Rc, Weak},
11};
12
13/// A node inside a DOM-like tree.
14pub struct Node {
15    parent: Cell<Option<Weak<Node>>>,
16    previous_sibling: Cell<Option<Weak<Node>>>,
17    next_sibling: Cell<Option<Rc<Node>>>,
18    first_child: Cell<Option<Rc<Node>>>,
19    last_child: Cell<Option<Weak<Node>>>,
20    data: NodeData,
21}
22impl std::fmt::Debug for Node {
23    #[inline]
24    #[allow(clippy::ref_as_ptr)]
25    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
26        write!(f, "{:?} @ {:?}", self.data, self as *const Self)
27    }
28}
29
30#[derive(Clone, Debug)]
31pub struct NodeRef(pub Rc<Node>);
32
33#[derive(Debug, PartialEq, Eq, Clone)]
34pub struct Attributes(pub Vec<(QualName, StrTendril)>);
35
36/// Node data specific to the node type.
37#[derive(Debug)]
38pub enum NodeData {
39    /// Element node
40    Element(ElementData),
41
42    /// Text node
43    Text(RefCell<StrTendril>),
44
45    /// Comment node
46    Comment(StrTendril),
47
48    /// Processing instruction node
49    ProcessingInstruction(StrTendril, StrTendril),
50
51    /// Doctype node
52    Doctype {
53        name: StrTendril,
54        public_id: StrTendril,
55        system_id: StrTendril,
56    },
57
58    /// Document node
59    Document(Cell<QuirksMode>),
60}
61
62/// Data specific to element nodes.
63pub struct ElementData {
64    /// The namespace and local name of the element, such as `ns!(html)` and `body`.
65    pub name: QualName,
66    /// The attributes of the elements.
67    pub attributes: RefCell<Attributes>,
68    pub start_offset: Cell<usize>,
69    pub end_offset: Cell<usize>,
70    pub closed: Cell<bool>,
71}
72impl std::fmt::Debug for ElementData {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        write!(f, "{:?}[{:?}]", self.name, self.attributes)
75    }
76}
77
78impl Attributes {
79    pub fn len(&self) -> usize {
80        self.0
81            .iter()
82            .map(|(k, v)| {
83                k.prefix
84                    .as_ref()
85                    .map(|e| e.len() + ":".len())
86                    .unwrap_or_default()
87                    + k.local.len()
88                    + " =\"\"".len()
89                    + escaped_len(v, true)
90            })
91            .sum()
92    }
93
94    pub(crate) fn new_attr(&mut self, key: &str, value: String) {
95        let name = QualName::new(None, Namespace::from(""), LocalName::from(key.to_string()));
96        for (k, v) in &mut self.0 {
97            if *k == name {
98                *v = value.into();
99                return;
100            }
101        }
102        self.0.push((name, value.into()));
103    }
104}
105impl From<Vec<html5ever::Attribute>> for Attributes {
106    fn from(value: Vec<html5ever::Attribute>) -> Self {
107        Self(
108            value
109                .into_iter()
110                .map(|html5ever::Attribute { name, value }| (name, value))
111                .collect(),
112        )
113    }
114}
115
116impl NodeRef {
117    pub fn string(&self) -> Cow<'_, str> {
118        let mut html = Vec::new();
119        let _ = html5ever::serialize(
120            &mut html,
121            self,
122            SerializeOpts {
123                traversal_scope: TraversalScope::IncludeNode,
124                ..Default::default()
125            },
126        );
127        String::from_utf8_lossy(&html).into_owned().into() //from_utf8_lossy_owned(html)
128    }
129
130    #[allow(clippy::cast_sign_loss)]
131    #[allow(clippy::cast_possible_wrap)]
132    fn len_update(&self, len: isize) {
133        if let Some(e) = self.as_element() {
134            //assert!((e.end_offset.get() as isize + len) >= e.start_offset.get() as isize);
135            e.end_offset
136                .set(((e.end_offset.get() as isize) + len) as usize);
137        }
138        let mut cur = self.clone();
139        while let Some(n) = cur.next_sibling() {
140            if let Some(e) = n.as_element() {
141                //assert!((e.end_offset.get() as isize + len) >= e.start_offset.get() as isize);
142                e.start_offset
143                    .set(((e.start_offset.get() as isize) + len) as usize);
144                e.end_offset
145                    .set(((e.end_offset.get() as isize) + len) as usize);
146            }
147            cur = n;
148        }
149        if let Some(p) = self.parent() {
150            p.len_update(len);
151        }
152    }
153
154    #[inline]
155    pub fn children(&self) -> Siblings {
156        match (self.first_child(), self.last_child()) {
157            (Some(first_child), Some(last_child)) => Siblings(Some(State {
158                next: first_child,
159                next_back: last_child,
160            })),
161            (None, None) => Siblings(None),
162            _ => unreachable!(),
163        }
164    }
165    pub fn len(&self) -> usize {
166        match &self.data {
167            NodeData::Comment(_) => 0, // c.as_bytes().len() + "<!---->".len(),
168            NodeData::Text(t) => t.borrow().as_bytes().len(),
169            NodeData::Element(e) => e.end_offset.get() - e.start_offset.get(),
170            NodeData::Doctype { name, .. } => "<!DOCTYPE >".len() + name.as_bytes().len(),
171            NodeData::ProcessingInstruction(target, data) => {
172                "<? >".len() + target.as_bytes().len() + data.as_bytes().len()
173            }
174            NodeData::Document(_) => self.children().map(|c| c.len()).sum(),
175        }
176    }
177
178    /// Create a new node.
179    #[inline]
180    pub fn new(data: NodeData) -> Self {
181        Self(Rc::new(Node {
182            parent: Cell::new(None),
183            first_child: Cell::new(None),
184            last_child: Cell::new(None),
185            previous_sibling: Cell::new(None),
186            next_sibling: Cell::new(None),
187            data,
188        }))
189    }
190
191    pub fn update_len(e: &ElementData) {
192        let len = Self::base_len(&e.name) + e.attributes.borrow().len();
193        e.end_offset.set(e.start_offset.get() + len);
194    }
195
196    fn self_closing(name: &QualName) -> bool {
197        use html5ever::{local_name, ns};
198        name.ns == ns!(html)
199            && matches!(
200                name.local,
201                local_name!("area")
202                    | local_name!("base")
203                    | local_name!("basefont")
204                    | local_name!("bgsound")
205                    | local_name!("br")
206                    | local_name!("col")
207                    | local_name!("embed")
208                    | local_name!("frame")
209                    | local_name!("hr")
210                    | local_name!("img")
211                    | local_name!("input")
212                    | local_name!("keygen")
213                    | local_name!("link")
214                    | local_name!("meta")
215                    | local_name!("param")
216                    | local_name!("source")
217                    | local_name!("track")
218                    | local_name!("wbr")
219            )
220    }
221
222    fn base_len(name: &QualName) -> usize {
223        let tag_len = tag_len(name);
224        if Self::self_closing(name) {
225            tag_len
226        } else {
227            2 * tag_len + 1
228        }
229    }
230
231    /// Create a new element node.
232    #[inline]
233    pub fn new_element(name: QualName, attributes: Attributes) -> Self {
234        let attrs_len: usize = attributes.len();
235        let base_len = Self::base_len(&name);
236        Self::new(NodeData::Element(ElementData {
237            name,
238            attributes: RefCell::new(attributes),
239            start_offset: Cell::new(0),
240            end_offset: Cell::new(base_len + attrs_len),
241            closed: Cell::new(false),
242        }))
243    }
244
245    /// Create a new text node.
246    #[inline]
247    pub fn new_text(value: StrTendril) -> Self {
248        Self::new(NodeData::Text(RefCell::new(value)))
249    }
250
251    /// Create a new comment node.
252    #[inline]
253    pub fn new_comment(value: StrTendril) -> Self {
254        Self::new(NodeData::Comment(value))
255    }
256
257    /// Create a new processing instruction node.
258    #[inline]
259    pub fn new_processing_instruction(target: StrTendril, data: StrTendril) -> Self {
260        Self::new(NodeData::ProcessingInstruction(target, data))
261    }
262
263    /// Create a new doctype node.
264    #[inline]
265    pub fn new_doctype(name: StrTendril, public_id: StrTendril, system_id: StrTendril) -> Self {
266        Self::new(NodeData::Doctype {
267            name,
268            public_id,
269            system_id,
270        })
271    }
272
273    /// Create a new document node.
274    #[inline]
275    pub fn new_document() -> Self {
276        Self::new(NodeData::Document(Cell::new(QuirksMode::NoQuirks)))
277    }
278
279    /// Append a new child to this node, after existing children.
280    ///
281    /// The new child is detached from its previous position.
282    pub fn append(&self, new_child: Self) {
283        new_child.detach();
284        new_child.parent.replace(Some(Rc::downgrade(&self.0)));
285        if let Some(last_child_weak) = self.last_child.replace(Some(Rc::downgrade(&new_child.0)))
286            && let Some(last_child) = last_child_weak.upgrade()
287        {
288            new_child.previous_sibling.replace(Some(last_child_weak));
289            debug_assert!(last_child.next_sibling.is_none());
290            last_child.next_sibling.replace(Some(new_child.0));
291            return;
292        }
293        debug_assert!(self.first_child.is_none());
294        self.first_child.replace(Some(new_child.0));
295    }
296}
297
298impl Node {
299    /// Return a reference to this node’s node-type-specific data.
300    #[inline]
301    pub const fn data(&self) -> &NodeData {
302        &self.data
303    }
304
305    /// If this node is an element, return a reference to element-specific data.
306    #[inline]
307    pub const fn as_element(&self) -> Option<&ElementData> {
308        match self.data {
309            NodeData::Element(ref value) => Some(value),
310            _ => None,
311        }
312    }
313
314    /// If this node is a document, return a reference to element-specific data.
315    #[inline]
316    pub const fn as_document(&self) -> Option<&Cell<QuirksMode>> {
317        match self.data {
318            NodeData::Document(ref value) => Some(value),
319            _ => None,
320        }
321    }
322
323    /// If this node is a text node, return a reference to its contents.
324    #[inline]
325    pub const fn as_text(&self) -> Option<&RefCell<StrTendril>> {
326        match self.data {
327            NodeData::Text(ref value) => Some(value),
328            _ => None,
329        }
330    }
331
332    /// Return a reference to the parent node, unless this node is the root of the tree.
333    #[inline]
334    pub fn parent(&self) -> Option<NodeRef> {
335        self.parent.upgrade().map(NodeRef)
336    }
337
338    /// Return a reference to the first child of this node, unless it has no child.
339    #[inline]
340    pub fn first_child(&self) -> Option<NodeRef> {
341        self.first_child.clone_inner().map(NodeRef)
342    }
343
344    /// Return a reference to the last child of this node, unless it has no child.
345    #[inline]
346    pub fn last_child(&self) -> Option<NodeRef> {
347        self.last_child.upgrade().map(NodeRef)
348    }
349
350    /// Return a reference to the previous sibling of this node, unless it is a first child.
351    #[inline]
352    pub fn previous_sibling(&self) -> Option<NodeRef> {
353        self.previous_sibling.upgrade().map(NodeRef)
354    }
355
356    /// Return a reference to the next sibling of this node, unless it is a last child.
357    #[inline]
358    pub fn next_sibling(&self) -> Option<NodeRef> {
359        self.next_sibling.clone_inner().map(NodeRef)
360    }
361
362    /// Detach a node from its parent and siblings. Children are not affected.
363    ///
364    /// To remove a node and its descendants, detach it and drop any strong reference to it.
365    pub fn detach(&self) {
366        let parent_weak = self.parent.take();
367        let previous_sibling_weak = self.previous_sibling.take();
368        let next_sibling_strong = self.next_sibling.take();
369
370        let previous_sibling_opt = previous_sibling_weak.as_ref().and_then(Weak::upgrade);
371
372        if let Some(next_sibling_ref) = next_sibling_strong.as_ref() {
373            next_sibling_ref
374                .previous_sibling
375                .replace(previous_sibling_weak);
376        } else if let Some(parent_ref) = parent_weak.as_ref()
377            && let Some(parent_strong) = parent_ref.upgrade()
378        {
379            parent_strong.last_child.replace(previous_sibling_weak);
380        }
381
382        if let Some(previous_sibling_strong) = previous_sibling_opt {
383            previous_sibling_strong
384                .next_sibling
385                .replace(next_sibling_strong);
386        } else if let Some(parent_ref) = parent_weak.as_ref()
387            && let Some(parent_strong) = parent_ref.upgrade()
388        {
389            parent_strong.first_child.replace(next_sibling_strong);
390        }
391    }
392}
393
394impl std::ops::Deref for NodeRef {
395    type Target = Node;
396    #[inline]
397    fn deref(&self) -> &Node {
398        &self.0
399    }
400}
401
402impl Eq for NodeRef {}
403impl PartialEq for NodeRef {
404    #[inline]
405    fn eq(&self, other: &Self) -> bool {
406        let a: *const Node = &raw const *self.0;
407        let b: *const Node = &raw const *other.0;
408        a == b
409    }
410}
411
412impl Drop for Node {
413    fn drop(&mut self) {
414        fn non_recursive_drop_unique_rc(mut rc: Rc<Node>, stack: &mut Vec<Rc<Node>>) {
415            loop {
416                if let Some(child) = rc.first_child.take_if_unique_strong() {
417                    stack.push(rc);
418                    rc = child;
419                    continue;
420                }
421                if let Some(sibling) = rc.next_sibling.take_if_unique_strong() {
422                    // The previous value of `rc: Rc<Node>` is dropped here.
423                    // Since it was unique, the corresponding `Node` is dropped as well.
424                    // `<Node as Drop>::drop` does not call `drop_rc`
425                    // as both the first child and next sibling were already taken.
426                    // Weak reference counts decremented here for `Cell`s that are `Some`:
427                    // * `rc.parent`: still has a strong reference in `stack` or elsewhere
428                    // * `rc.last_child`: this is the last weak ref. Deallocated now.
429                    // * `rc.previous_sibling`: this is the last weak ref. Deallocated now.
430                    rc = sibling;
431                    continue;
432                }
433                if let Some(parent) = stack.pop() {
434                    // Same as in the above comment.
435                    rc = parent;
436                    continue;
437                }
438                return;
439            }
440        }
441        // `.take_if_unique_strong()` temporarily leaves the tree in an inconsistent state,
442        // as the corresponding `Weak` reference in the other direction is not removed.
443        // It is important that all `Some(_)` strong references it returns
444        // are dropped by the end of this `drop` call,
445        // and that no user code is invoked in-between.
446
447        // Sharing `stack` between these two calls is not necessary,
448        // but it allows re-using memory allocations.
449        let mut stack = Vec::new();
450        if let Some(rc) = self.first_child.take_if_unique_strong() {
451            non_recursive_drop_unique_rc(rc, &mut stack);
452        }
453        if let Some(rc) = self.next_sibling.take_if_unique_strong() {
454            non_recursive_drop_unique_rc(rc, &mut stack);
455        }
456    }
457}
458
459#[inline]
460pub fn tag_len(name: &QualName) -> usize {
461    name.prefix
462        .as_ref()
463        .map(|e| e.len() + 1 /* ':' */)
464        .unwrap_or_default()
465        + name.local.len()
466        + "<>".len()
467}
468
469pub fn escaped_len(str: &str, attr_mode: bool) -> usize {
470    str.chars()
471        .map(|b| match b {
472            '&' => "&amp;".len(),
473            '\u{00A0}' => "&nbsp;".len(),
474            '"' if attr_mode => "&quot;".len(),
475            '<' if !attr_mode => "&lt;".len(),
476            '>' if !attr_mode => "&gt;".len(),
477            c => c.len_utf8(),
478        })
479        .sum()
480}
481
482#[derive(Debug, Clone)]
483struct State<T> {
484    next: T,
485    next_back: T,
486}
487/// A double-ended iterator of sibling nodes.
488#[derive(Debug, Clone)]
489pub struct Siblings(Option<State<NodeRef>>);
490
491macro_rules! siblings_next {
492    ($next: ident, $next_back: ident, $next_sibling: ident) => {
493        fn $next(&mut self) -> Option<NodeRef> {
494            #![allow(non_shorthand_field_patterns)]
495            self.0.take().map(
496                |State {
497                     $next: next,
498                     $next_back: next_back,
499                 }| {
500                    if let Some(sibling) = next.$next_sibling() {
501                        if next != next_back {
502                            self.0 = Some(State {
503                                $next: sibling,
504                                $next_back: next_back,
505                            })
506                        }
507                    }
508                    next
509                },
510            )
511        }
512    };
513}
514
515impl Iterator for Siblings {
516    type Item = NodeRef;
517    siblings_next!(next, next_back, next_sibling);
518}
519
520impl DoubleEndedIterator for Siblings {
521    siblings_next!(next_back, next, previous_sibling);
522}
523
524impl html5ever::serialize::Serialize for NodeRef {
525    fn serialize<S>(
526        &self,
527        serializer: &mut S,
528        traversal_scope: TraversalScope,
529    ) -> std::io::Result<()>
530    where
531        S: html5ever::serialize::Serializer,
532    {
533        match (traversal_scope, self.data()) {
534            (ref scope, NodeData::Element(element)) => {
535                if *scope == TraversalScope::IncludeNode {
536                    let attrs = element.attributes.borrow();
537
538                    serializer.start_elem(
539                        element.name.clone(),
540                        attrs.0.iter().map(|(name, value)| (name, &**value)),
541                    )?;
542                }
543                let children = self.children();
544
545                for child in children {
546                    html5ever::serialize::Serialize::serialize(
547                        &child,
548                        serializer,
549                        TraversalScope::IncludeNode,
550                    )?;
551                }
552
553                if *scope == TraversalScope::IncludeNode {
554                    serializer.end_elem(element.name.clone())?;
555                }
556                Ok(())
557            }
558
559            (_, &NodeData::Document(_)) => {
560                for child in self.children() {
561                    html5ever::serialize::Serialize::serialize(
562                        &child,
563                        serializer,
564                        TraversalScope::IncludeNode,
565                    )?;
566                }
567                Ok(())
568            }
569
570            (TraversalScope::ChildrenOnly(_), _) => Ok(()),
571
572            (TraversalScope::IncludeNode, NodeData::Doctype { name, .. }) => {
573                serializer.write_doctype(name)
574            }
575            (TraversalScope::IncludeNode, NodeData::Text(text)) => {
576                serializer.write_text(&text.borrow())
577            }
578            (TraversalScope::IncludeNode, NodeData::Comment(_text)) => Ok(()), //serializer.write_comment(text),
579            (TraversalScope::IncludeNode, NodeData::ProcessingInstruction(target, data)) => {
580                serializer.write_processing_instruction(target, data)
581            }
582        }
583    }
584}
585
586trait CellOption {
587    fn is_none(&self) -> bool;
588}
589
590impl<T> CellOption for Cell<Option<T>> {
591    #[inline]
592    fn is_none(&self) -> bool {
593        unsafe { (*self.as_ptr()).is_none() }
594    }
595}
596
597trait CellOptionWeak<T> {
598    fn upgrade(&self) -> Option<Rc<T>>;
599}
600
601impl<T> CellOptionWeak<T> for Cell<Option<Weak<T>>> {
602    #[inline]
603    fn upgrade(&self) -> Option<Rc<T>> {
604        unsafe { (*self.as_ptr()).as_ref().and_then(Weak::upgrade) }
605    }
606}
607
608trait CellOptionRc<T> {
609    /// Return `Some` if this `Rc` is the only strong reference count,
610    /// even if there are weak references.
611    fn take_if_unique_strong(&self) -> Option<Rc<T>>;
612    fn clone_inner(&self) -> Option<Rc<T>>;
613}
614
615impl<T> CellOptionRc<T> for Cell<Option<Rc<T>>> {
616    #[inline]
617    fn take_if_unique_strong(&self) -> Option<Rc<T>> {
618        unsafe {
619            match *self.as_ptr() {
620                None => None,
621                Some(ref rc) if Rc::strong_count(rc) > 1 => None,
622                // Not borrowing the `Rc<T>` here
623                // as we would be invalidating that borrow while it is outstanding:
624                Some(_) => self.take(),
625            }
626        }
627    }
628
629    #[inline]
630    fn clone_inner(&self) -> Option<Rc<T>> {
631        unsafe { (*self.as_ptr()).clone() }
632    }
633}