flams_search/
lib.rs

1#![allow(unexpected_cfgs)]
2#![cfg_attr(all(doc, CHANNEL_NIGHTLY), feature(doc_cfg))]
3#![doc = include_str!("../README.md")]
4/*!
5 * ## Feature flags
6 */
7#![cfg_attr(doc,doc = document_features::document_features!())]
8
9use flams_backend_types::search::{QueryFilter, SearchIndex, SearchResult};
10use flams_math_archives::{
11    Archive, LocallyBuilt,
12    artifacts::{Artifact, ContentResult, FileOrString},
13    backend::{AnyBackend, GlobalBackend, LocalBackend},
14    build_target,
15    formats::BuildResult,
16    utils::errors::{ArtifactSaveError, FileError},
17};
18use flams_system::FlamsExtension;
19use ftml_uris::{DocumentUri, SymbolUri, UriPath, UriWithArchive};
20
21use crate::{index::SearchIndexExt, schema::SearchSchema};
22
23pub mod index;
24pub mod query;
25pub mod schema;
26
27flams_system::register_exension!(FlamsExtension {
28    name: "tantivy_search",
29    on_start: initialize,
30    on_build_result: |b, uri, rel_path, a| if let Some(content) =
31        a.as_any().downcast_ref::<ContentResult>()
32    {
33        index(b, uri, rel_path, content);
34    }
35});
36
// Declares the `TANTIVY` build target ("tantivy_search").
// Its `run` callback ignores its argument and returns a default `BuildResult`; the search
// artifact itself is saved from `index`, which tags it with `TANTIVY.id()`.
build_target!(TANTIVY {
    name: "tantivy_search",
    description: "search index",
    run: |_| BuildResult::default()
});
42
/// Memory budget passed to `Index::writer` when `initialize` builds the index
/// (tantivy documents this argument as a size in bytes).
const MEMORY_SIZE: usize = 50_000_000;
/// Global searcher, constructed via `Searcher::new` on first access; see `Searcher::get`.
static SEARCHER: std::sync::LazyLock<Searcher> = std::sync::LazyLock::new(Searcher::new);
/// Parent-less tracing span (target `search`) entered by the query methods and `initialize`.
static SPAN: std::sync::LazyLock<tracing::Span> =
    std::sync::LazyLock::new(|| tracing::info_span!(target:"search",parent:None,"search"));
47
48pub struct Searcher {
49    index: parking_lot::RwLock<tantivy::index::Index>,
50    reader: parking_lot::RwLock<tantivy::IndexReader>,
51    writer: parking_lot::Mutex<()>,
52}
53impl Searcher {
54    #[inline]
55    #[must_use]
56    pub fn get() -> &'static Self {
57        &SEARCHER
58    }
59
60    fn new() -> Self {
61        let index =
62            tantivy::index::Index::create_in_ram(schema::SearchSchema::get().schema.clone());
63        Self {
64            reader: parking_lot::RwLock::new(index.reader().expect("Failed to build reader")),
65            index: parking_lot::RwLock::new(index),
66            writer: parking_lot::Mutex::new(()),
67        }
68    }
69
70    pub fn query(
71        &self,
72        s: &str,
73        opts: QueryFilter,
74        num_results: usize,
75    ) -> Option<Vec<(f32, SearchResult)>> {
76        SPAN.in_scope(move || {
77            let searcher = self.reader.read().searcher();
78            let query = query::build_query(s, &self.index.read(), opts)?;
79            let top_num = if num_results == 0 {
80                usize::MAX / 2
81            } else {
82                num_results
83            };
84            let mut ret = Vec::new();
85            for (s, a) in searcher
86                .search(&*query, &tantivy::collector::TopDocs::with_limit(top_num))
87                .ok()?
88            {
89                let query::Wrapper(r) = searcher.doc(a).ok()?;
90                ret.push((s, r));
91            }
92            Some(ret)
93        })
94    }
95
96    #[allow(clippy::type_complexity)]
97    pub fn query_symbols(
98        &self,
99        s: &str,
100        num_results: usize,
101    ) -> Option<Vec<(SymbolUri, Vec<(f32, SearchResult)>)>> {
102        SPAN.in_scope(move || {
103            const FILTER: QueryFilter = QueryFilter {
104                allow_documents: false,
105                allow_paragraphs: true,
106                allow_definitions: true,
107                allow_examples: false,
108                allow_assertions: true,
109                allow_problems: false,
110                definition_like_only: true,
111            };
112            let searcher = self.reader.read().searcher();
113
114            let query = query::build_query(s, &self.index.read(), FILTER)?;
115            let top_num = if num_results == 0 {
116                usize::MAX / 2
117            } else {
118                num_results
119            };
120            let mut ret: Vec<(SymbolUri, Vec<(f32, SearchResult)>)> = Vec::new();
121            for (s, a) in searcher
122                .search(
123                    &*query,
124                    &tantivy::collector::TopDocs::with_limit(top_num * 2),
125                )
126                .ok()?
127            {
128                let query::Wrapper(r): query::Wrapper<SearchResult> = searcher.doc(a).ok()?;
129                if let SearchResult::Paragraph { fors, .. } = &r {
130                    for sym in fors {
131                        if let Some(v) = ret
132                            .iter_mut()
133                            .find_map(|(k, v)| if *k == *sym { Some(v) } else { None })
134                        {
135                            v.push((s, r.clone()));
136                        } else {
137                            ret.push((sym.clone(), vec![(s, r.clone())]));
138                        }
139                    }
140                }
141            }
142            if ret.len() > num_results {
143                let _ = ret.split_off(num_results);
144            }
145            Some(ret)
146        })
147    }
148}
149
150fn index(backend: &AnyBackend, uri: &DocumentUri, rel_path: &UriPath, result: &ContentResult) {
151    backend.with_buildable_archive(uri.archive_id(), |a| {
152        if let Some(a) = a {
153            let it: Vec<_> = index::index_document(&result.document, &result.ftml).collect();
154            let _ = a.save(
155                uri,
156                Some(rel_path),
157                FileOrString::Str(String::new().into_boxed_str()),
158                TANTIVY.id(),
159                Some(Box::new(IndexFile(it)) as _),
160                GlobalBackend.triple_store(),
161                false,
162            );
163        }
164    });
165}
166
167struct IndexFile(Vec<SearchIndex>);
168impl Artifact for IndexFile {
169    fn as_any(&self) -> &dyn std::any::Any {
170        self as _
171    }
172    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
173        self as _
174    }
175    fn kind(&self) -> &'static str {
176        "tantivy"
177    }
178    fn write(&self, into: &std::path::Path) -> Result<(), ArtifactSaveError> {
179        let file = std::fs::File::create(into)
180            .map_err(|e| ArtifactSaveError::Fs(FileError::Creation(into.to_path_buf(), e)))?;
181        bincode::serde::encode_into_std_write(
182            &self.0,
183            &mut std::io::BufWriter::new(file),
184            bincode::config::standard(),
185        )?;
186        Ok(())
187    }
188}
189
190fn initialize() {
191    SPAN.in_scope(|| {
192        use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
193        let index = tantivy::index::Index::create_in_ram(SearchSchema::get().schema.clone());
194        let mut writer = index
195            .writer(MEMORY_SIZE)
196            .expect("Failed to instantiate search writer");
197        let wr = &writer;
198        tracing::info_span!("Loading search indices").in_scope(move || {
199            GlobalBackend
200                .all_archives()
201                .par_iter()
202                .filter_map(|a| match a {
203                    Archive::Local(a) => Some(a),
204                    Archive::Ext(_, _) => None,
205                })
206                .for_each(|a| {
207                    let out = a.out_dir();
208                    if out.exists() && out.is_dir() {
209                        for e in walkdir::WalkDir::new(out)
210                            .into_iter()
211                            .filter_map(Result::ok)
212                            .filter(|entry| entry.file_name() == "tantivy")
213                        {
214                            let Ok(f) = std::fs::File::open(e.path()) else {
215                                tracing::error!("error reading file {}", e.path().display());
216                                return;
217                            };
218                            let file = std::io::BufReader::new(f);
219
220                            let Ok(v): Result<Vec<SearchIndex>, _> =
221                                bincode::serde::decode_from_reader(
222                                    file,
223                                    bincode::config::standard(),
224                                )
225                            else {
226                                tracing::error!("error deserializing file {}", e.path().display());
227                                return;
228                            };
229                            for d in v {
230                                let d: tantivy::TantivyDocument = d.to_document();
231                                if let Err(e) = wr.add_document(d) {
232                                    tracing::error!("{e}");
233                                }
234                            }
235                        }
236                    }
237                });
238        });
239        match writer.commit() {
240            Ok(i) => tracing::info!("Loaded {i} entries"),
241            Err(e) => tracing::error!("Error: {e}"),
242        }
243        let slf = Searcher::get();
244        let writer = slf.writer.lock();
245        let mut old_index = slf.index.write();
246        let mut reader = slf.reader.write();
247        let Ok(r) = index.reader() else {
248            tracing::error!("Failed to instantiate search reader");
249            return;
250        };
251        *reader = r;
252        *old_index = index;
253        drop(reader);
254        drop(old_index);
255        drop(writer);
256    });
257}