Skip to main content

tex_engine/engine/mouth/
strings.rs

1use crate::prelude::*;
2use crate::tex::characters::{TextLine, TextLineSource};
3use crate::utils::errors::InvalidCharacter;
4
/// The scanning state of an [`InputTokenizer`]; it determines how space and
/// end-of-line characters are treated when they are encountered.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum MouthState {
    /// At the start of a line: space characters are skipped, and an
    /// end-of-line character produces the paragraph token.
    NewLine,
    /// Directly after a space or a control sequence whose name consists of
    /// letters: further space characters are skipped.
    SkipBlank,
    /// Inside a line, after any other token.
    MidLine,
}
15
16/** Takes a [`TextLineSource`] and lazily turns it into [`Token`]s, given a [`CategoryCodeScheme`] and an optional
17    end-of-line [`Character`]. The primary use case is to process an input `.tex` file.
18
19  *Example:*
20```rust
21# use tex_engine::utils::errors::ErrorThrower;
22# use tex_engine::engine::mouth::strings::InputTokenizer;
23# use tex_engine::tex::tokens::{Token,StandardToken};
24# use tex_engine::tex::catcodes::DEFAULT_SCHEME_U8;
25# use tex_engine::utils::Ptr;
26# use tex_engine::tex::catcodes::CommandCode;
27# use tex_engine::tex::characters::StringLineSource;
28# use tex_engine::prelude::CSHandler;
29#
30type T = StandardToken<u8,Ptr<str>>;
31let mut cs_handler = ();
32let cc = &DEFAULT_SCHEME_U8;
33
34let string = "\\foo   \n  \n   {a}{!}";
35let input: StringLineSource<u8> = string.into();
36let mut tokenizer = InputTokenizer::new(input);
37let eol = Some(b'\r');
38let par = CSHandler::<u8,Ptr<str>>::par(&cs_handler);
39let next = tokenizer.get_next(&mut cs_handler,cc,None,&par); // \foo
40assert!(matches!(next,Ok(Some(T::ControlSequence(s))) if &*s == "foo"));
41let next = tokenizer.get_next(&mut cs_handler,cc,eol,&par); // \par
42assert!(matches!(next,Ok(Some(T::ControlSequence(s))) if &*s == "par"));
43let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // {
44assert_eq!(next.command_code(), CommandCode::BeginGroup);
45let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // a
46assert_eq!(next.command_code(), CommandCode::Letter);
47let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // }
48assert_eq!(next.command_code(), CommandCode::EndGroup);
49let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // {
50assert_eq!(next.command_code(), CommandCode::BeginGroup);
51let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // !
52assert_eq!(next.command_code(), CommandCode::Other);
53let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // }
54assert_eq!(next.command_code(), CommandCode::EndGroup);
55let next : T = tokenizer.get_next(&mut cs_handler,cc,eol,&par).unwrap().unwrap(); // end of line => space
56assert_eq!(next.command_code(), CommandCode::Space);
57assert!(tokenizer.get_next::<T>(&mut cs_handler,cc,eol,&par).unwrap().is_none()); // EOF
58```
59*/
60#[derive(Clone, Debug)]
61pub struct InputTokenizer<C: Character, S: TextLineSource<C>> {
62    pub state: MouthState,
63    line: usize,
64    col: usize,
65    current_line: TextLine<C>,
66    pub source: S,
67    pub(crate) eof: bool,
68    tempstr: Vec<C>,
69}
70
71type Csh<T> = <<T as Token>::CS as CSName<<T as Token>::Char>>::Handler;
72
73impl<C: Character, S: TextLineSource<C>> InputTokenizer<C, S> {
74    /// Create a new [`InputTokenizer`] from a [`TextLineSource`]
75    pub fn new(mut source: S) -> Self {
76        Self {
77            state: MouthState::NewLine,
78            line: 1,
79            col: 0,
80            current_line: source.get_line().unwrap_or(TextLine::default()),
81            source,
82            eof: false,
83            tempstr: Vec::new(),
84        }
85    }
86    /// The current line
87
88    pub fn line(&self) -> usize {
89        self.line
90    }
91    /// The current column
92
93    pub fn column(&self) -> usize {
94        self.col + 1
95    }
96    /// whether the file end has been reached
97
98    pub fn eof(&self) -> bool {
99        self.eof
100    }
101
102    fn get_char(&mut self) -> Option<C> {
103        if self.col >= self.current_line.len() {
104            None
105        } else {
106            let next = self.current_line[self.col];
107            self.col += 1;
108            Some(next)
109        }
110    }
111
112    /// `\readline` - read a line of input as [`Character`]s of [`CategoryCode::Other`] (except for ` `, which has
113    /// [`Space`](CategoryCode::Space)) and passing each token to the given function.
114    pub fn readline<T: Token<Char = C>, F: FnMut(T)>(&mut self, mut f: F) {
115        while self.col < self.current_line.len() {
116            let next = self.current_line[self.col];
117            self.col += 1;
118            match next.try_into() {
119                Ok(b' ') => f(T::space()),
120                _ => f(T::from_char_cat(next, CommandCode::Other)),
121            }
122        }
123        self.next_line();
124    }
125
126    /// `\read` - read a line of input as [`Character`]s in the currenct [`CategoryCodeScheme`], respecting
127    /// braces ([`CategoryCode::BeginGroup`] and [`EndGroup`](CategoryCode::EndGroup)) and passing each token to the
128    /// given function.
129    pub fn read<T: Token<Char = C>, F: FnMut(T)>(
130        &mut self,
131        handler: &mut Csh<T>,
132        cc: &CategoryCodeScheme<C>,
133        endline: Option<C>,
134        par_token: &T::CS,
135        mut f: F,
136    ) -> Result<(), InvalidCharacter<C>> {
137        let mut ingroups = 0;
138        let mut ret: Result<(), InvalidCharacter<C>> = Ok(());
139        let line = self.line;
140        while self.line == line || ingroups > 0 {
141            match self.get_char() {
142                None => {
143                    if self.eof {
144                        return ret;
145                    }
146                    if let Some(n) = self.return_endline::<T>(cc, endline, par_token) {
147                        f(n)
148                    }
149                    return ret;
150                }
151                Some(c) => match self.check_char::<T>(handler, cc, endline, c, par_token) {
152                    Ok(None) if self.line == line || ingroups > 0 => (),
153                    Ok(None) => return ret,
154                    Ok(Some(tk)) => {
155                        if tk.command_code() == CommandCode::BeginGroup {
156                            ingroups += 1
157                        } else if tk.command_code() == CommandCode::EndGroup {
158                            ingroups -= 1
159                        }
160                        f(tk)
161                    }
162                    Err(i) => {
163                        f(T::from_char_cat(i.0, CommandCode::Other));
164                        ret = Err(i)
165                    }
166                },
167            }
168        }
169        ret
170    }
171
172    /// Get the next [`Token`] from the [`InputTokenizer`] (if not empty). Throws [`InvalidCharacter`]
173    /// on encountering a character of code [`CategoryCode::Invalid`].
174    pub fn get_next<T: Token<Char = C>>(
175        &mut self,
176        handler: &mut Csh<T>,
177        cc: &CategoryCodeScheme<C>,
178        endline: Option<C>,
179        par_token: &T::CS,
180    ) -> Result<Option<T>, InvalidCharacter<C>> {
181        loop {
182            match self.get_char() {
183                None if self.eof => return Ok(None),
184                None => {
185                    if let Some(e) = self.return_endline::<T>(cc, endline, par_token) {
186                        //debug_log!(trace=>"Returning endline {}",e.printable(&interner));
187                        return Ok(Some(e));
188                    }
189                }
190                Some(c) => {
191                    if let Some(t) = self.check_char::<T>(handler, cc, endline, c, par_token)? {
192                        return Ok(Some(t));
193                    }
194                }
195            };
196        }
197    }
198
199    fn check_char<T: Token<Char = C>>(
200        &mut self,
201        handler: &mut Csh<T>,
202        cc: &CategoryCodeScheme<C>,
203        endline: Option<C>,
204        c: C,
205        par_token: &T::CS,
206    ) -> Result<Option<T>, InvalidCharacter<C>> {
207        use CategoryCode::*;
208        match cc.get(c) {
209            EOL if self.state == MouthState::NewLine => {
210                self.next_line();
211                Ok(Some(self.do_par(par_token.clone())))
212            }
213            EOL => Ok(self.return_endline::<T>(cc, endline, par_token)),
214            Space if self.state == MouthState::SkipBlank => Ok(None),
215            Space if self.state == MouthState::NewLine => Ok(None),
216            Space => {
217                self.state = MouthState::SkipBlank;
218                Ok(Some(T::space()))
219            }
220            Ignored => Ok(None),
221            Comment => {
222                self.next_line();
223                self.state = MouthState::NewLine;
224                Ok(None)
225            }
226            Invalid => Err(InvalidCharacter(c)),
227            Escape => Ok(Some(self.get_escape::<T>(handler, cc, endline))),
228            Superscript => match self.maybe_superscript(c) {
229                Some(c) => self.check_char::<T>(handler, cc, endline, c, par_token),
230                None => {
231                    self.state = MouthState::MidLine;
232                    Ok(Some(T::from_char_cat(c, CommandCode::Superscript)))
233                }
234            },
235            cc => {
236                self.state = MouthState::MidLine;
237                Ok(Some(T::from_char_cat(c, (*cc).into())))
238            }
239        }
240    }
241
242    fn next_line(&mut self) {
243        if let Some(next) = self.source.get_line() {
244            self.current_line = next;
245            self.line += 1;
246            self.col = 0;
247        } else {
248            self.eof = true;
249            self.col = self.current_line.len();
250            self.state = MouthState::MidLine;
251        }
252    }
253
254    fn do_par<T: Token<Char = C>>(&mut self, par: T::CS) -> T {
255        if self.current_line.is_empty() {
256            while let Some(line) = self.source.get_line() {
257                self.line += 1;
258                if !line.is_empty() {
259                    self.current_line = line;
260                    break;
261                }
262            }
263        }
264        T::from_cs(par)
265    }
266
267    fn return_endline<T: Token<Char = C>>(
268        &mut self,
269        cc: &CategoryCodeScheme<C>,
270        endline: Option<C>,
271        par: &T::CS,
272    ) -> Option<T> {
273        use CategoryCode::*;
274        self.next_line();
275        let ret = match endline {
276            None => None,
277            Some(c) => match cc.get(c) {
278                Space | EOL if self.state == MouthState::SkipBlank => None,
279                Space if self.state == MouthState::NewLine => None,
280                EOL if self.state == MouthState::NewLine => Some(self.do_par(par.clone())),
281                EOL => Some(T::space()),
282                Ignored | Invalid | Comment => None,
283                o => Some(T::from_char_cat(c, (*o).into())),
284            },
285        };
286        self.state = MouthState::NewLine;
287        ret
288    }
289
290    fn get_escape<T: Token<Char = C>>(
291        &mut self,
292        handler: &mut Csh<T>,
293        cc: &CategoryCodeScheme<C>,
294        endline: Option<C>,
295    ) -> T {
296        let name = match self.get_char() {
297            None => {
298                self.next_line();
299                match endline {
300                    None => handler.empty_str(),
301                    Some(c) => {
302                        self.tempstr.clear();
303                        self.tempstr.push(c);
304                        handler.cs_from_chars(&self.tempstr)
305                    }
306                }
307            }
308            Some(next) => self.check_escape::<T>(handler, cc, next),
309        };
310        T::from_cs(name)
311    }
312
313    fn check_escape<T: Token<Char = C>>(
314        &mut self,
315        handler: &mut Csh<T>,
316        cc: &CategoryCodeScheme<C>,
317        next: C,
318    ) -> T::CS {
319        use CategoryCode::*;
320        match cc.get(next) {
321            Superscript => match self.maybe_superscript(next) {
322                Some(c) => self.check_escape::<T>(handler, cc, c),
323                None => {
324                    self.state = MouthState::MidLine;
325                    self.tempstr.clear();
326                    self.tempstr.push(next);
327                    handler.cs_from_chars(&self.tempstr)
328                }
329            },
330            Letter => self.get_cs_name::<T>(handler, cc, next),
331            _ => {
332                self.state = MouthState::MidLine;
333                self.tempstr.clear();
334                self.tempstr.push(next);
335                handler.cs_from_chars(&self.tempstr)
336            }
337        }
338    }
339
340    fn get_cs_name<T: Token<Char = C>>(
341        &mut self,
342        handler: &mut Csh<T>,
343        cc: &CategoryCodeScheme<C>,
344        first: C,
345    ) -> T::CS {
346        self.tempstr.clear();
347        self.tempstr.push(first);
348        self.state = MouthState::SkipBlank;
349        loop {
350            match self.get_char() {
351                None => break,
352                Some(next) => match cc.get(next) {
353                    CategoryCode::Letter => self.tempstr.push(next),
354                    CategoryCode::Superscript => {
355                        let curr = self.col;
356                        match self.maybe_superscript(next) {
357                            Some(c) if *cc.get(c) == CategoryCode::Letter => self.tempstr.push(c),
358                            _ => {
359                                self.col = curr;
360                                self.col -= 1;
361                                break;
362                            }
363                        }
364                    }
365                    _ => {
366                        self.col -= 1;
367                        break;
368                    }
369                },
370            }
371        }
372        handler.cs_from_chars(&self.tempstr)
373    }
374
375    fn cond(i: C) -> bool {
376        (Into::<C>::into(48u8) <= i && i <= Into::<C>::into(57u8))
377            || (Into::<C>::into(97u8) <= i && i <= Into::<C>::into(102u8))
378    }
379
380    fn maybe_superscript(&mut self, firstsup: C) -> Option<C> {
381        match self.get_char() {
382            None => None,
383            Some(c) if c != firstsup => {
384                self.col -= 1;
385                None
386            }
387            Some(_) => match self.get_char() {
388                None => {
389                    self.col -= 1;
390                    None
391                }
392                Some(first) => match self.get_char() {
393                    None => {
394                        if first < (128).into() {
395                            let u: u8 = match first.try_into() {
396                                Ok(u) => u,
397                                Err(_) => {
398                                    self.col -= 1;
399                                    return None;
400                                }
401                            };
402                            let ch: C = (if u < 64 { u + 64 } else { u - 64 }).into();
403                            Some(ch)
404                        } else {
405                            self.col -= 2;
406                            None
407                        }
408                    }
409                    Some(second) => {
410                        if Self::cond(first) && Self::cond(second) {
411                            let ufirst: u8 = match first.try_into() {
412                                Ok(u) => u,
413                                Err(_) => {
414                                    self.col -= 2;
415                                    return None;
416                                }
417                            };
418                            let usecond: u8 = match second.try_into() {
419                                Ok(u) => u,
420                                Err(_) => {
421                                    self.col -= 2;
422                                    return None;
423                                }
424                            };
425                            let char = u8::from_str_radix(
426                                std::str::from_utf8(&[ufirst, usecond]).unwrap(),
427                                16,
428                            )
429                            .unwrap();
430                            Some(char.into())
431                        } else {
432                            self.col -= 1;
433                            if first < (128).into() {
434                                let u: u8 = match first.try_into() {
435                                    Ok(u) => u,
436                                    Err(_) => {
437                                        self.col -= 2;
438                                        return None;
439                                    }
440                                };
441                                let ch: C = (if u < 64 { u + 64 } else { u - 64 }).into();
442                                Some(ch)
443                            } else {
444                                self.col -= 2;
445                                None
446                            }
447                        }
448                    }
449                },
450            },
451        }
452    }
453
454    /// Only useful for debugging purposes: Print the next `len` [`Character`]s to the given [`Write`](std::fmt::Write)r.
455    pub fn preview<W: std::fmt::Write>(&self, len: &mut usize, mut f: W) -> std::fmt::Result {
456        if self.current_line.is_empty() {
457            return Ok(());
458        }
459        if self.current_line.len() > self.col {
460            for c in &self.current_line[self.col..] {
461                *len -= 1;
462                c.display_fmt(&mut f);
463                if *len == 0 {
464                    return Ok(());
465                }
466            }
467        }
468        Ok(())
469    }
470}