Skip to main content

tex_engine/engine/mouth/
strings.rs

1use crate::prelude::*;
2use crate::tex::characters::{TextLine, TextLineSource};
3use crate::utils::errors::InvalidCharacter;
4
/// The scanning state of an [`InputTokenizer`]; it is always in exactly one of these three states.
///
/// The state decides how space and end-of-line characters are treated: spaces are dropped
/// in [`NewLine`](MouthState::NewLine) and [`SkipBlank`](MouthState::SkipBlank), and an
/// end-of-line character produces a paragraph token in [`NewLine`](MouthState::NewLine).
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum MouthState {
    /// Beginning of line; this is the initial state and the state entered after every
    /// end of line and after a comment.
    NewLine,
    /// After a space (or control sequence); subsequent spaces are skipped.
    SkipBlank,
    /// In the middle of a line, i.e. after any other token has been produced.
    MidLine,
}
15
/** Takes a [`TextLineSource`] and lazily turns it into [`Token`]s, given a [`CategoryCodeScheme`] and an optional
    end-of-line [`Character`]. The primary use case is to process an input `.tex` file.

    Tokens are produced one at a time via [`get_next`](InputTokenizer::get_next); the category code scheme
    and end-of-line character are passed on every call, so they may change between tokens.

  *Example:*
```rust
# use tex_engine::utils::errors::ErrorThrower;
# use tex_engine::engine::mouth::strings::InputTokenizer;
# use tex_engine::tex::tokens::{Token,StandardToken};
# use tex_engine::tex::catcodes::DEFAULT_SCHEME_U8;
# use tex_engine::utils::Ptr;
# use tex_engine::tex::catcodes::CommandCode;
# use tex_engine::tex::characters::StringLineSource;
#
type T = StandardToken<u8,Ptr<str>>;
let mut cs_handler = ();
let cc = &DEFAULT_SCHEME_U8;

let string = "\\foo   \n  \n   {a}{!}";
let input: StringLineSource<u8> = string.into();
let mut tokenizer = InputTokenizer::new(input);
let eol = Some(b'\r');
let next = tokenizer.get_next(&mut cs_handler,cc,None); // \foo
assert!(matches!(next,Ok(Some(T::ControlSequence(s))) if &*s == "foo"));
let next = tokenizer.get_next(&mut cs_handler,cc,eol); // \par
assert!(matches!(next,Ok(Some(T::ControlSequence(s))) if &*s == "par"));
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // {
assert_eq!(next.command_code(), CommandCode::BeginGroup);
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // a
assert_eq!(next.command_code(), CommandCode::Letter);
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // }
assert_eq!(next.command_code(), CommandCode::EndGroup);
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // {
assert_eq!(next.command_code(), CommandCode::BeginGroup);
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // !
assert_eq!(next.command_code(), CommandCode::Other);
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // }
assert_eq!(next.command_code(), CommandCode::EndGroup);
let next : T = tokenizer.get_next(&mut cs_handler,cc,eol).unwrap().unwrap(); // end of line => space
assert_eq!(next.command_code(), CommandCode::Space);
assert!(tokenizer.get_next::<T>(&mut cs_handler,cc,eol).unwrap().is_none()); // EOF
```
*/
#[derive(Clone, Debug)]
pub struct InputTokenizer<C: Character, S: TextLineSource<C>> {
    /// The current scanning state; governs how spaces and line ends are treated.
    pub state: MouthState,
    /// Number of the current line; starts at 1 and is incremented for every further line fetched.
    line: usize,
    /// Zero-based index into `current_line` of the next character to be read.
    col: usize,
    /// The line currently being tokenized.
    current_line: TextLine<C>,
    /// The underlying source from which further lines are fetched.
    pub source: S,
    /// Set once `source` has reported that no further line is available.
    pub(crate) eof: bool,
    /// Scratch buffer reused for assembling control sequence names.
    tempstr: Vec<C>,
}
68
/// Shorthand for the control-sequence-name handler belonging to a [`Token`] type `T`,
/// i.e. the `Handler` associated with `T::CS` as a [`CSName`] over `T::Char`.
type Csh<T> = <<T as Token>::CS as CSName<<T as Token>::Char>>::Handler;
70
71impl<C: Character, S: TextLineSource<C>> InputTokenizer<C, S> {
72    /// Create a new [`InputTokenizer`] from a [`TextLineSource`]
73    pub fn new(mut source: S) -> Self {
74        Self {
75            state: MouthState::NewLine,
76            line: 1,
77            col: 0,
78            current_line: source.get_line().unwrap_or(TextLine::default()),
79            source,
80            eof: false,
81            tempstr: Vec::new(),
82        }
83    }
    /// The number of the line currently being tokenized (the first line is line 1).
    pub fn line(&self) -> usize {
        self.line
    }
    /// The current column, counted from 1: the internal zero-based read position plus one.
    pub fn column(&self) -> usize {
        self.col + 1
    }
    /// Whether the end of the input has been reached, i.e. the underlying source
    /// had no further line to offer when one was last requested.
    pub fn eof(&self) -> bool {
        self.eof
    }
99
100    fn get_char(&mut self) -> Option<C> {
101        if self.col >= self.current_line.len() {
102            None
103        } else {
104            let next = self.current_line[self.col];
105            self.col += 1;
106            Some(next)
107        }
108    }
109
110    /// `\readline` - read a line of input as [`Character`]s of [`CategoryCode::Other`] (except for ` `, which has
111    /// [`Space`](CategoryCode::Space)) and passing each token to the given function.
112    pub fn readline<T: Token<Char = C>, F: FnMut(T)>(&mut self, mut f: F) {
113        while self.col < self.current_line.len() {
114            let next = self.current_line[self.col];
115            self.col += 1;
116            match next.try_into() {
117                Ok(b' ') => f(T::space()),
118                _ => f(T::from_char_cat(next, CommandCode::Other)),
119            }
120        }
121        self.next_line();
122    }
123
124    /// `\read` - read a line of input as [`Character`]s in the currenct [`CategoryCodeScheme`], respecting
125    /// braces ([`CategoryCode::BeginGroup`] and [`EndGroup`](CategoryCode::EndGroup)) and passing each token to the
126    /// given function.
127    pub fn read<T: Token<Char = C>, F: FnMut(T)>(
128        &mut self,
129        handler: &mut Csh<T>,
130        cc: &CategoryCodeScheme<C>,
131        endline: Option<C>,
132        mut f: F,
133    ) -> Result<(), InvalidCharacter<C>> {
134        let mut ingroups = 0;
135        let mut ret: Result<(), InvalidCharacter<C>> = Ok(());
136        let line = self.line;
137        while self.line == line || ingroups > 0 {
138            match self.get_char() {
139                None => {
140                    if self.eof {
141                        return ret;
142                    }
143                    if let Some(n) = self.return_endline::<T>(cc, endline, handler.par()) {
144                        f(n)
145                    }
146                    return ret;
147                }
148                Some(c) => match self.check_char::<T>(handler, cc, endline, c) {
149                    Ok(None) if self.line == line || ingroups > 0 => (),
150                    Ok(None) => return ret,
151                    Ok(Some(tk)) => {
152                        if tk.command_code() == CommandCode::BeginGroup {
153                            ingroups += 1
154                        } else if tk.command_code() == CommandCode::EndGroup {
155                            ingroups -= 1
156                        }
157                        f(tk)
158                    }
159                    Err(i) => {
160                        f(T::from_char_cat(i.0, CommandCode::Other));
161                        ret = Err(i)
162                    }
163                },
164            }
165        }
166        ret
167    }
168
169    /// Get the next [`Token`] from the [`InputTokenizer`] (if not empty). Throws [`InvalidCharacter`]
170    /// on encountering a character of code [`CategoryCode::Invalid`].
171    pub fn get_next<T: Token<Char = C>>(
172        &mut self,
173        handler: &mut Csh<T>,
174        cc: &CategoryCodeScheme<C>,
175        endline: Option<C>,
176    ) -> Result<Option<T>, InvalidCharacter<C>> {
177        loop {
178            match self.get_char() {
179                None if self.eof => return Ok(None),
180                None => {
181                    if let Some(e) = self.return_endline::<T>(cc, endline, handler.par()) {
182                        //debug_log!(trace=>"Returning endline {}",e.printable(&interner));
183                        return Ok(Some(e));
184                    }
185                }
186                Some(c) => {
187                    if let Some(t) = self.check_char::<T>(handler, cc, endline, c)? {
188                        return Ok(Some(t));
189                    }
190                }
191            };
192        }
193    }
194
195    fn check_char<T: Token<Char = C>>(
196        &mut self,
197        handler: &mut Csh<T>,
198        cc: &CategoryCodeScheme<C>,
199        endline: Option<C>,
200        c: C,
201    ) -> Result<Option<T>, InvalidCharacter<C>> {
202        use CategoryCode::*;
203        match cc.get(c) {
204            EOL if self.state == MouthState::NewLine => {
205                self.next_line();
206                Ok(Some(self.do_par(handler.par())))
207            }
208            EOL => Ok(self.return_endline::<T>(cc, endline, handler.par())),
209            Space if self.state == MouthState::SkipBlank => Ok(None),
210            Space if self.state == MouthState::NewLine => Ok(None),
211            Space => {
212                self.state = MouthState::SkipBlank;
213                Ok(Some(T::space()))
214            }
215            Ignored => Ok(None),
216            Comment => {
217                self.next_line();
218                self.state = MouthState::NewLine;
219                Ok(None)
220            }
221            Invalid => Err(InvalidCharacter(c)),
222            Escape => Ok(Some(self.get_escape::<T>(handler, cc, endline))),
223            Superscript => match self.maybe_superscript(c) {
224                Some(c) => self.check_char::<T>(handler, cc, endline, c),
225                None => {
226                    self.state = MouthState::MidLine;
227                    Ok(Some(T::from_char_cat(c, CommandCode::Superscript)))
228                }
229            },
230            cc => {
231                self.state = MouthState::MidLine;
232                Ok(Some(T::from_char_cat(c, (*cc).into())))
233            }
234        }
235    }
236
237    fn next_line(&mut self) {
238        if let Some(next) = self.source.get_line() {
239            self.current_line = next;
240            self.line += 1;
241            self.col = 0;
242        } else {
243            self.eof = true;
244            self.col = self.current_line.len();
245            self.state = MouthState::MidLine;
246        }
247    }
248
249    fn do_par<T: Token<Char = C>>(&mut self, par: T::CS) -> T {
250        if self.current_line.is_empty() {
251            while let Some(line) = self.source.get_line() {
252                self.line += 1;
253                if !line.is_empty() {
254                    self.current_line = line;
255                    break;
256                }
257            }
258        }
259        T::from_cs(par)
260    }
261
262    fn return_endline<T: Token<Char = C>>(
263        &mut self,
264        cc: &CategoryCodeScheme<C>,
265        endline: Option<C>,
266        par: T::CS,
267    ) -> Option<T> {
268        use CategoryCode::*;
269        self.next_line();
270        let ret = match endline {
271            None => None,
272            Some(c) => match cc.get(c) {
273                Space | EOL if self.state == MouthState::SkipBlank => None,
274                Space if self.state == MouthState::NewLine => None,
275                EOL if self.state == MouthState::NewLine => Some(self.do_par(par)),
276                EOL => Some(T::space()),
277                Ignored | Invalid | Comment => None,
278                o => Some(T::from_char_cat(c, (*o).into())),
279            },
280        };
281        self.state = MouthState::NewLine;
282        ret
283    }
284
285    fn get_escape<T: Token<Char = C>>(
286        &mut self,
287        handler: &mut Csh<T>,
288        cc: &CategoryCodeScheme<C>,
289        endline: Option<C>,
290    ) -> T {
291        let name = match self.get_char() {
292            None => {
293                self.next_line();
294                match endline {
295                    None => handler.empty_str(),
296                    Some(c) => {
297                        self.tempstr.clear();
298                        self.tempstr.push(c);
299                        handler.cs_from_chars(&self.tempstr)
300                    }
301                }
302            }
303            Some(next) => self.check_escape::<T>(handler, cc, next),
304        };
305        T::from_cs(name)
306    }
307
308    fn check_escape<T: Token<Char = C>>(
309        &mut self,
310        handler: &mut Csh<T>,
311        cc: &CategoryCodeScheme<C>,
312        next: C,
313    ) -> T::CS {
314        use CategoryCode::*;
315        match cc.get(next) {
316            Superscript => match self.maybe_superscript(next) {
317                Some(c) => self.check_escape::<T>(handler, cc, c),
318                None => {
319                    self.state = MouthState::MidLine;
320                    self.tempstr.clear();
321                    self.tempstr.push(next);
322                    handler.cs_from_chars(&self.tempstr)
323                }
324            },
325            Letter => self.get_cs_name::<T>(handler, cc, next),
326            _ => {
327                self.state = MouthState::MidLine;
328                self.tempstr.clear();
329                self.tempstr.push(next);
330                handler.cs_from_chars(&self.tempstr)
331            }
332        }
333    }
334
335    fn get_cs_name<T: Token<Char = C>>(
336        &mut self,
337        handler: &mut Csh<T>,
338        cc: &CategoryCodeScheme<C>,
339        first: C,
340    ) -> T::CS {
341        self.tempstr.clear();
342        self.tempstr.push(first);
343        self.state = MouthState::SkipBlank;
344        loop {
345            match self.get_char() {
346                None => break,
347                Some(next) => match cc.get(next) {
348                    CategoryCode::Letter => self.tempstr.push(next),
349                    CategoryCode::Superscript => {
350                        let curr = self.col;
351                        match self.maybe_superscript(next) {
352                            Some(c) if *cc.get(c) == CategoryCode::Letter => self.tempstr.push(c),
353                            _ => {
354                                self.col = curr;
355                                self.col -= 1;
356                                break;
357                            }
358                        }
359                    }
360                    _ => {
361                        self.col -= 1;
362                        break;
363                    }
364                },
365            }
366        }
367        handler.cs_from_chars(&self.tempstr)
368    }
369
370    fn cond(i: C) -> bool {
371        (Into::<C>::into(48u8) <= i && i <= Into::<C>::into(57u8))
372            || (Into::<C>::into(97u8) <= i && i <= Into::<C>::into(102u8))
373    }
374
375    fn maybe_superscript(&mut self, firstsup: C) -> Option<C> {
376        match self.get_char() {
377            None => None,
378            Some(c) if c != firstsup => {
379                self.col -= 1;
380                None
381            }
382            Some(_) => match self.get_char() {
383                None => {
384                    self.col -= 1;
385                    None
386                }
387                Some(first) => match self.get_char() {
388                    None => {
389                        if first < (128).into() {
390                            let u: u8 = match first.try_into() {
391                                Ok(u) => u,
392                                Err(_) => {
393                                    self.col -= 1;
394                                    return None;
395                                }
396                            };
397                            let ch: C = (if u < 64 { u + 64 } else { u - 64 }).into();
398                            Some(ch)
399                        } else {
400                            self.col -= 2;
401                            None
402                        }
403                    }
404                    Some(second) => {
405                        if Self::cond(first) && Self::cond(second) {
406                            let ufirst: u8 = match first.try_into() {
407                                Ok(u) => u,
408                                Err(_) => {
409                                    self.col -= 2;
410                                    return None;
411                                }
412                            };
413                            let usecond: u8 = match second.try_into() {
414                                Ok(u) => u,
415                                Err(_) => {
416                                    self.col -= 2;
417                                    return None;
418                                }
419                            };
420                            let char = u8::from_str_radix(
421                                std::str::from_utf8(&[ufirst, usecond]).unwrap(),
422                                16,
423                            )
424                            .unwrap();
425                            Some(char.into())
426                        } else {
427                            self.col -= 1;
428                            if first < (128).into() {
429                                let u: u8 = match first.try_into() {
430                                    Ok(u) => u,
431                                    Err(_) => {
432                                        self.col -= 2;
433                                        return None;
434                                    }
435                                };
436                                let ch: C = (if u < 64 { u + 64 } else { u - 64 }).into();
437                                Some(ch)
438                            } else {
439                                self.col -= 2;
440                                None
441                            }
442                        }
443                    }
444                },
445            },
446        }
447    }
448
449    /// Only useful for debugging purposes: Print the next `len` [`Character`]s to the given [`Write`](std::fmt::Write)r.
450    pub fn preview<W: std::fmt::Write>(&self, len: &mut usize, mut f: W) -> std::fmt::Result {
451        if self.current_line.is_empty() {
452            return Ok(());
453        }
454        if self.current_line.len() > self.col {
455            for c in &self.current_line[self.col..] {
456                *len -= 1;
457                c.display_fmt(&mut f);
458                if *len == 0 {
459                    return Ok(());
460                }
461            }
462        }
463        Ok(())
464    }
465}