1use crate::textify::textify;
2#[cfg(feature = "vectorsearch")]
3use flams_backend_types::search::Embedding;
4#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
5use flams_backend_types::search::SearchResult;
6use flams_backend_types::search::SearchResultKind;
7use ftml_ontology::{
8 narrative::{
9 documents::Document,
10 elements::{DocumentElementRef, LogicalParagraph},
11 },
12 utils::RefTree,
13};
14use ftml_uris::{DocumentElementUri, DocumentUri, SymbolUri};
15
/// A single entry to be written to the full-text (tantivy) search index.
///
/// Text fields hold plain text (as produced by `textify` in this module's indexing
/// functions), not raw HTML.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
pub enum SearchIndex {
    /// Entry for a whole document.
    Document {
        /// URI of the document.
        uri: DocumentUri,
        /// Plain-text title, if the document has one.
        title: Option<String>,
        /// Plain-text body of the document.
        body: String,
    },
    /// Entry for a single logical paragraph inside a document.
    Paragraph {
        /// URI of the paragraph element.
        uri: DocumentElementUri,
        /// Search-result kind derived from the paragraph's kind.
        kind: SearchResultKind,
        /// Result of `is_definition_like` for the paragraph's kind and styles.
        definition_like: bool,
        /// Plain-text title, if the paragraph has one.
        title: Option<String>,
        /// Symbol URIs taken from the paragraph's `fors`.
        fors: Vec<SymbolUri>,
        /// Plain-text body of the paragraph.
        body: String,
    },
}
33
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
impl SearchIndex {
    /// Rebuilds a [`SearchResult`] from a stored tantivy document.
    ///
    /// Returns `None` when the `kind` or `uri_str` field (or, for paragraphs, the
    /// `def_like` field) is missing, holds a value of the wrong type, or cannot be
    /// converted/parsed. Values of the `fors` field that are not strings or fail to parse
    /// are skipped rather than failing the whole conversion.
    pub(crate) fn from_document(doc: tantivy::TantivyDocument) -> Option<SearchResult> {
        use tantivy::schema::Value;

        let schema = crate::schema::SearchSchema::get();
        let kind: SearchResultKind = doc.get_first(schema.kind)?.as_u64()?.try_into().ok()?;
        let uri_text = doc.get_first(schema.uri_str)?.as_str()?;

        // Documents only carry a URI; everything else is a paragraph-shaped result.
        if matches!(kind, SearchResultKind::Document) {
            return Some(SearchResult::Document(uri_text.parse().ok()?));
        }

        let uri = uri_text.parse().ok()?;
        let def_like = doc.get_first(schema.def_like)?.as_bool()?;
        let fors: Vec<_> = doc
            .get_all(schema.fors)
            .filter_map(|value| value.as_str()?.parse().ok())
            .collect();
        Some(SearchResult::Paragraph {
            uri,
            fors,
            def_like,
            kind,
        })
    }

    /// Converts this entry into a tantivy document following `SearchSchema`.
    ///
    /// The URI is stored twice: as raw bytes in `uri` and as text in `uri_str`.
    /// Paragraph entries additionally store `def_like` and one `fors` value per symbol.
    pub(crate) fn to_document(self) -> tantivy::TantivyDocument {
        let schema = crate::schema::SearchSchema::get();

        // Split the entry into the parts shared by both variants plus the
        // paragraph-only extras, so the fields are written in one place.
        let (kind, uri_text, title, body, paragraph_extras) = match self {
            Self::Document { uri, title, body } => (
                SearchResultKind::Document,
                uri.to_string(),
                title,
                body,
                None,
            ),
            Self::Paragraph {
                uri,
                kind,
                definition_like,
                title,
                fors,
                body,
            } => (
                kind,
                uri.to_string(),
                title,
                body,
                Some((definition_like, fors)),
            ),
        };

        let mut out = tantivy::TantivyDocument::default();
        out.add_u64(schema.kind, kind.into());
        out.add_bytes(schema.uri, uri_text.as_bytes());
        out.add_text(schema.uri_str, uri_text);
        if let Some((definition_like, fors)) = paragraph_extras {
            out.add_bool(schema.def_like, definition_like);
            for symbol in fors {
                out.add_text(schema.fors, symbol.to_string());
            }
        }
        if let Some(title_text) = title {
            out.add_text(schema.title, title_text);
        }
        out.add_text(schema.body, body);
        out
    }
}
101
/// A single entry to be written to the vector (embedding-based) search index.
///
/// Text is represented by its [`Embedding`] rather than stored verbatim.
#[cfg(feature = "vectorsearch")]
#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
pub enum SearchIndex {
    /// Entry for a whole document.
    Document {
        /// URI of the document.
        uri: DocumentUri,
        /// Embedding of the title text; `None` when there is no (non-empty) title.
        title: Option<Embedding>,
        /// Embedding of the document's body text.
        body: Embedding,
    },
    /// Entry for a single logical paragraph inside a document.
    Paragraph {
        /// URI of the paragraph element.
        uri: DocumentElementUri,
        /// Search-result kind derived from the paragraph's kind.
        kind: SearchResultKind,
        /// Result of `is_definition_like` for the paragraph's kind and styles.
        definition_like: bool,
        /// Embedding of the title text; `None` when there is no (non-empty) title.
        title: Option<Embedding>,
        /// Symbol URIs taken from the paragraph's `fors`.
        fors: Vec<SymbolUri>,
        /// Embedding of the paragraph's body text.
        body: Embedding,
    },
}
119
/// Builds the full-text search entries for `doc`.
///
/// The result starts with the document-level entry (when `index_document_html` yields
/// one), followed by one entry per paragraph of `doc.dfs()` that `index_paragraph`
/// accepts. `html` is the document's HTML; paragraph bodies are sliced out of it.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
pub fn index_document(doc: &Document, html: &str) -> Vec<SearchIndex> {
    // Lazy: paragraphs are only visited once the chain below is collected.
    let paragraph_entries = doc.dfs().filter_map(|element| match element {
        DocumentElementRef::Paragraph(para) => index_paragraph(para, html),
        _ => None,
    });
    index_document_html(doc, html)
        .into_iter()
        .chain(paragraph_entries)
        .collect()
}
135
/// Builds the document-level full-text entry for `doc`.
///
/// The title (if any) and the complete `html` are converted to plain text via `textify`.
/// As written, this always returns `Some`.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
#[must_use]
pub fn index_document_html(doc: &Document, html: &str) -> Option<SearchIndex> {
    let plain_title = doc.title.as_ref().map(|raw| textify(raw, true));
    let plain_body = textify(html, false);
    let entry = SearchIndex::Document {
        uri: doc.uri.clone(),
        title: plain_title,
        body: plain_body,
    };
    Some(entry)
}
147
/// Builds the full-text search entry for a single logical paragraph.
///
/// `html` is the HTML of the document containing `para`; the paragraph body is the slice
/// `para.range.start..para.range.end` of it, converted to plain text via `textify`.
///
/// Returns `None` when
/// - the paragraph's range is not a valid slice of `html` (out of bounds or not on a
///   character boundary); this case is logged as an error, or
/// - the paragraph's kind has no corresponding [`SearchResultKind`].
///
/// Runs inside `crate::SPAN` so the log output is attributed to that span.
#[cfg(all(feature = "tantivy", not(feature = "vectorsearch")))]
pub fn index_paragraph(para: &LogicalParagraph, html: &str) -> Option<SearchIndex> {
    crate::SPAN.in_scope(move || {
        let title = para.title.as_ref().map(|s| textify(s, true));
        // `str::get` instead of indexing: a bad range must not panic the indexer.
        let Some(body) = html.get(para.range.start..para.range.end) else {
            tracing::error!(
                "Failed to plain textify body of {}: Error getting HTML range in document",
                para.uri
            );
            return None;
        };
        let body = textify(body, true);

        // Paragraph kinds without a search-result counterpart are not indexed.
        let kind = para.kind.try_into().ok()?;
        let definition_like = para.kind.is_definition_like(&para.styles);
        // Clone the symbols only once we know the paragraph will be indexed.
        let fors = para.fors.iter().map(|(f, _)| f.clone()).collect();

        Some(SearchIndex::Paragraph {
            uri: para.uri.clone(),
            kind,
            definition_like,
            title,
            fors,
            body,
        })
    })
}
177
178#[cfg(feature = "vectorsearch")]
179#[must_use]
180pub fn index_document(doc: &Document, html: &str) -> Vec<SearchIndex> {
181 use flams_backend_types::search::Embedding;
182
183 let mut indexes = vec![SearchIndex::Document {
184 uri: doc.uri.clone(),
185 title: None,
186 body: Embedding::zero(),
187 }];
188 let txt = textify(html, false);
189 if txt.is_empty() {
190 return Vec::new();
191 }
192 let mut texts = vec![txt];
193 if let Some(ttl) = doc.title.as_ref() {
194 let SearchIndex::Document { title, .. } = &mut indexes[0] else {
195 unreachable!()
196 };
197 let txt = textify(ttl, true);
198 if !txt.is_empty() {
199 *title = Some(Embedding::zero());
200 texts.push(txt);
201 }
202 }
203
204 for e in doc.dfs() {
205 if let DocumentElementRef::Paragraph(para) = e
206 && let Some(body) = html.get(para.range.start..para.range.end)
207 && let Ok(kind) = para.kind.try_into()
208 {
209 let mut txt = textify(body, false);
210 if txt.is_empty() {
211 continue;
212 }
213 if !para.fors.is_empty() {
214 txt.push_str("\nKEYWORDS: ");
215 let mut first = true;
216 for (uri, _) in ¶.fors {
217 if !first {
218 txt.push_str(", ");
219 }
220 txt.push_str(uri.name().as_ref());
221 first = false;
222 }
223 }
224 texts.push(txt);
225 let title = para.title.as_ref().and_then(|ttl| {
226 let txt = textify(ttl, true);
227 if txt.is_empty() {
228 None
229 } else {
230 texts.push(txt);
231 Some(Embedding::zero())
232 }
233 });
234
235 indexes.push(SearchIndex::Paragraph {
236 uri: para.uri.clone(),
237 kind,
238 definition_like: para.kind.is_definition_like(¶.styles),
239 title,
240 fors: para.fors.iter().map(|(u, _)| u).cloned().collect(),
241 body: Embedding::zero(),
242 });
243 }
244 }
245 let Ok(results) = crate::Embedder::embed(&texts) else {
246 todo!()
247 };
248 drop(texts);
249 let mut results = results.into_iter();
250 let mut idx_iter = indexes.iter_mut();
251 let Some(SearchIndex::Document { title, body, .. }) = idx_iter.next() else {
252 unsafe {
254 use std::hint::unreachable_unchecked;
255 unreachable_unchecked()
256 }
257 };
258 *body = unsafe { results.next().unwrap_unchecked() };
260 if title.is_some() {
261 *title = Some(unsafe { results.next().unwrap_unchecked() });
263 }
264 for index in idx_iter {
265 let SearchIndex::Paragraph { title, body, .. } = index else {
266 unsafe {
268 use std::hint::unreachable_unchecked;
269 unreachable_unchecked()
270 }
271 };
272 *body = unsafe { results.next().unwrap_unchecked() };
274 if title.is_some() {
275 *title = Some(unsafe { results.next().unwrap_unchecked() });
277 }
278 }
279 indexes
280}