Skip to main content

flams_search/
index.rs

1use crate::textify::textify;
2#[cfg(feature = "vectorsearch")]
3use flams_backend_types::search::Embedding;
4#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
5use flams_backend_types::search::SearchResult;
6use flams_backend_types::search::SearchResultKind;
7use ftml_ontology::{
8    narrative::{
9        documents::Document,
10        elements::{DocumentElementRef, LogicalParagraph},
11    },
12    utils::RefTree,
13};
14use ftml_uris::{DocumentElementUri, DocumentUri, SymbolUri};
15
/// One plain-text entry to be stored in the full-text search index.
///
/// This variant of the type is compiled when the `tantivy` feature is enabled and
/// `vectorsearch` is not.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
pub enum SearchIndex {
    /// Entry describing a whole document.
    Document {
        /// URI of the document.
        uri: DocumentUri,
        /// Text of the document title, if the document has one.
        title: Option<String>,
        /// Text of the document body.
        body: String,
    },
    /// Entry describing a single logical paragraph inside a document.
    Paragraph {
        /// URI of the paragraph element.
        uri: DocumentElementUri,
        /// Kind of search result this paragraph is reported as.
        kind: SearchResultKind,
        /// Whether the paragraph counts as definition-like.
        definition_like: bool,
        /// Text of the paragraph title, if the paragraph has one.
        title: Option<String>,
        /// Symbols this paragraph is "for".
        fors: Vec<SymbolUri>,
        /// Text of the paragraph body.
        body: String,
    },
}
33
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
impl SearchIndex {
    /// Turns a stored tantivy document back into a [`SearchResult`].
    ///
    /// Returns `None` when the `kind` or `uri_str` field is missing or cannot be
    /// decoded, or — for non-document entries — when the `def_like` flag is missing.
    /// Values of the `fors` field that are not strings or fail to parse are skipped.
    pub(crate) fn from_document(doc: tantivy::TantivyDocument) -> Option<SearchResult> {
        use tantivy::schema::Value;

        let schema = crate::schema::SearchSchema::get();
        let kind: SearchResultKind = doc.get_first(schema.kind)?.as_u64()?.try_into().ok()?;

        // Whole-document hits only carry their URI.
        if matches!(kind, SearchResultKind::Document) {
            let uri = doc.get_first(schema.uri_str)?.as_str()?.parse().ok()?;
            return Some(SearchResult::Document(uri));
        }

        // Every other kind is a paragraph hit.
        let uri = doc.get_first(schema.uri_str)?.as_str()?.parse().ok()?;
        let def_like = doc.get_first(schema.def_like)?.as_bool()?;
        let fors = doc
            .get_all(schema.fors)
            .filter_map(|value| value.as_str()?.parse().ok())
            .collect::<Vec<_>>();
        Some(SearchResult::Paragraph {
            uri,
            fors,
            def_like,
            kind,
        })
    }

    /// Converts this entry into a tantivy document using the crate's search schema.
    ///
    /// The URI is written twice: as raw bytes into `uri` and as text into `uri_str`.
    /// Paragraph entries additionally get the `def_like` flag and one `fors` value per
    /// symbol. The title (when present) and the body are written last for both variants.
    pub(crate) fn to_document(self) -> tantivy::TantivyDocument {
        let schema = crate::schema::SearchSchema::get();
        let mut out = tantivy::TantivyDocument::default();

        // Write the variant-specific fields and hand back what both variants share.
        let (title, body) = match self {
            Self::Document { uri, title, body } => {
                out.add_u64(schema.kind, SearchResultKind::Document.into());
                let uri = uri.to_string();
                out.add_bytes(schema.uri, uri.as_bytes());
                out.add_text(schema.uri_str, uri);
                (title, body)
            }
            Self::Paragraph {
                uri,
                kind,
                definition_like,
                title,
                fors,
                body,
            } => {
                out.add_u64(schema.kind, kind.into());
                let uri = uri.to_string();
                out.add_bytes(schema.uri, uri.as_bytes());
                out.add_text(schema.uri_str, uri);
                out.add_bool(schema.def_like, definition_like);
                for symbol in fors {
                    out.add_text(schema.fors, symbol.to_string());
                }
                (title, body)
            }
        };

        if let Some(title) = title {
            out.add_text(schema.title, title);
        }
        out.add_text(schema.body, body);
        out
    }
}
101
/// One embedding-based entry to be stored in the vector search index.
///
/// This variant of the type is compiled when the `vectorsearch` feature is enabled.
#[cfg(feature = "vectorsearch")]
#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
pub enum SearchIndex {
    /// Entry describing a whole document.
    Document {
        /// URI of the document.
        uri: DocumentUri,
        /// Embedding of the document title, if one was produced.
        title: Option<Embedding>,
        /// Embedding of the document body.
        body: Embedding,
    },
    /// Entry describing a single logical paragraph inside a document.
    Paragraph {
        /// URI of the paragraph element.
        uri: DocumentElementUri,
        /// Kind of search result this paragraph is reported as.
        kind: SearchResultKind,
        /// Whether the paragraph counts as definition-like.
        definition_like: bool,
        /// Embedding of the paragraph title, if one was produced.
        title: Option<Embedding>,
        /// Symbols this paragraph is "for".
        fors: Vec<SymbolUri>,
        /// Embedding of the paragraph body.
        body: Embedding,
    },
}
119
/// Collects the search index entries for `doc`.
///
/// The result starts with the entry for the document itself (if
/// [`index_document_html`] yields one), followed by one entry per paragraph found in a
/// depth-first walk of the document for which [`index_paragraph`] returns `Some`.
/// `html` is the document's HTML; it is passed on to both helpers.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
pub fn index_document(doc: &Document, html: &str) -> Vec<SearchIndex> {
    let paragraphs = doc.dfs().filter_map(|elem| match elem {
        DocumentElementRef::Paragraph(para) => index_paragraph(para, html),
        _ => None,
    });
    index_document_html(doc, html)
        .into_iter()
        .chain(paragraphs)
        .collect()
}
135
/// Builds the document-level search index entry for `doc`.
///
/// The title (when the document has one) and the full `html` are each passed through
/// `textify`. As written, this always returns `Some`.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
#[must_use]
pub fn index_document_html(doc: &Document, html: &str) -> Option<SearchIndex> {
    Some(SearchIndex::Document {
        uri: doc.uri.clone(),
        title: doc.title.as_ref().map(|raw| textify(raw, true)),
        body: textify(html, false),
    })
}
147
/// Builds the search index entry for a single paragraph of a document.
///
/// `html` is the HTML of the enclosing document; the paragraph body is the slice of it
/// given by `para.range`. Runs inside the crate's tracing span.
///
/// Returns `None` (after logging an error) if that range cannot be sliced out of
/// `html`, and `None` (silently) if the paragraph kind has no corresponding
/// [`SearchResultKind`].
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
pub fn index_paragraph(para: &LogicalParagraph, html: &str) -> Option<SearchIndex> {
    crate::SPAN.in_scope(move || {
        let title = para.title.as_ref().map(|raw| textify(raw, true));

        // `str::get` yields `None` for out-of-bounds or non-char-boundary ranges.
        let body = match html.get(para.range.start..para.range.end) {
            Some(fragment) => textify(fragment, true),
            None => {
                tracing::error!(
                    "Failed to plain textify body of {}: Error getting HTML range in document",
                    para.uri
                );
                return None;
            }
        };

        let fors = para.fors.iter().map(|(symbol, _)| symbol.clone()).collect();
        let kind: SearchResultKind = para.kind.try_into().ok()?;

        Some(SearchIndex::Paragraph {
            uri: para.uri.clone(),
            kind,
            definition_like: para.kind.is_definition_like(&para.styles),
            title,
            fors,
            body,
        })
    })
}
177
178#[cfg(feature = "vectorsearch")]
179#[must_use]
180pub fn index_document(doc: &Document, html: &str) -> Vec<SearchIndex> {
181    use flams_backend_types::search::Embedding;
182
183    let mut indexes = vec![SearchIndex::Document {
184        uri: doc.uri.clone(),
185        title: None,
186        body: Embedding::zero(),
187    }];
188    let txt = textify(html, false);
189    if txt.is_empty() {
190        return Vec::new();
191    }
192    let mut texts = vec![txt];
193    if let Some(ttl) = doc.title.as_ref() {
194        let SearchIndex::Document { title, .. } = &mut indexes[0] else {
195            unreachable!()
196        };
197        let txt = textify(ttl, true);
198        if !txt.is_empty() {
199            *title = Some(Embedding::zero());
200            texts.push(txt);
201        }
202    }
203
204    for e in doc.dfs() {
205        if let DocumentElementRef::Paragraph(para) = e
206            && let Some(body) = html.get(para.range.start..para.range.end)
207            && let Ok(kind) = para.kind.try_into()
208        {
209            let mut txt = textify(body, false);
210            if txt.is_empty() {
211                continue;
212            }
213            if !para.fors.is_empty() {
214                txt.push_str("\nKEYWORDS: ");
215                let mut first = true;
216                for (uri, _) in &para.fors {
217                    if !first {
218                        txt.push_str(", ");
219                    }
220                    txt.push_str(uri.name().as_ref());
221                    first = false;
222                }
223            }
224            texts.push(txt);
225            let title = para.title.as_ref().and_then(|ttl| {
226                let txt = textify(ttl, true);
227                if txt.is_empty() {
228                    None
229                } else {
230                    texts.push(txt);
231                    Some(Embedding::zero())
232                }
233            });
234
235            indexes.push(SearchIndex::Paragraph {
236                uri: para.uri.clone(),
237                kind,
238                definition_like: para.kind.is_definition_like(&para.styles),
239                title,
240                fors: para.fors.iter().map(|(u, _)| u).cloned().collect(),
241                body: Embedding::zero(),
242            });
243        }
244    }
245    let Ok(results) = crate::Embedder::embed(&texts) else {
246        todo!()
247    };
248    drop(texts);
249    let mut results = results.into_iter();
250    let mut idx_iter = indexes.iter_mut();
251    let Some(SearchIndex::Document { title, body, .. }) = idx_iter.next() else {
252        // SAFETY: we know that it contains at least once document at the start
253        unsafe {
254            use std::hint::unreachable_unchecked;
255            unreachable_unchecked()
256        }
257    };
258    // SAFETY: results.len() == texts.len()
259    *body = unsafe { results.next().unwrap_unchecked() };
260    if title.is_some() {
261        // SAFETY: result.len() == texts.len() && title.is_some() iff there is a title in texts
262        *title = Some(unsafe { results.next().unwrap_unchecked() });
263    }
264    for index in idx_iter {
265        let SearchIndex::Paragraph { title, body, .. } = index else {
266            // SAFETY: only the first element is a Document
267            unsafe {
268                use std::hint::unreachable_unchecked;
269                unreachable_unchecked()
270            }
271        };
272        // SAFETY: results.len() == texts.len()
273        *body = unsafe { results.next().unwrap_unchecked() };
274        if title.is_some() {
275            // SAFETY: result.len() == texts.len() && title.is_some() iff there is a title in texts
276            *title = Some(unsafe { results.next().unwrap_unchecked() });
277        }
278    }
279    indexes
280}