Skip to main content

tex_engine/tex/
characters.rs

1/*! Data structures for reading input text. */
2use crate::tex::catcodes::{CategoryCode, CategoryCodeScheme};
3use std::fmt::{Debug, Display};
4
5/** A single character in a `.tex` file; in plain TeX, this is a `u8`,
6but in e.g. XeTeX, it is a UTF-8 character. */
7pub trait Character:
8    Sized
9    + Eq
10    + Copy
11    + Display
12    + Debug
13    + From<u8>
14    + TryInto<u8>
15    + TryFrom<u64>
16    + Into<u64>
17    + Ord
18    + std::hash::Hash
19    + Default
20    + 'static
21{
22    /// Type that maps characters to other data.
23    type CharMap<A: Clone + Default>: CharacterMap<Self, A>;
24    /// Iterator over characters in a string.
25    type Iter<'a>: ExactSizeIterator<Item = Self>;
26    /// minimal value of this type in numeric form (e.g. `0` for `u8`)
27    const MIN: Self;
28    /// maximal value of this type in numeric form (e.g. `255` for `u8`)
29    const MAX: Self;
30    /// Convert a line in a file/string (as a vector of bytes) into a [`Vec`] of [`Character`]s.
31    fn convert(input: Vec<u8>) -> TextLine<Self>;
32
33    /// Display this character to a [`Write`](std::fmt::Write) (e.g. a `&mut String`). Relevant for e.g.
34    /// TeX's convention to display control characters using `^^` encoding.
35    fn display_fmt<W: std::fmt::Write>(&self, target: &mut W);
36
37    /// Convert this character to a [`DisplayableCharacter`] that calls [`display`](Self::display_fmt); useful in
38    /// `format!` and `write!` macros.
39
40    fn display(&self) -> DisplayableCharacter<Self> {
41        DisplayableCharacter(*self)
42    }
43
44    /// Convert this character to a `char`.
45    fn to_char(&self) -> char;
46
47    /// Like [`display`](Self::display), but for an [`Option`]`<`[`Character`]`>`. Useful for
48    /// `format!` and `write!` macros specifically for the current `\ecapechar` (which may or may not be defined).
49
50    fn display_opt(c: Option<Self>) -> DisplayableCharacterOpt<Self> {
51        DisplayableCharacterOpt(c)
52    }
53    /// The starting [`CategoryCodeScheme`] for this character type.
54    fn starting_catcode_scheme() -> CategoryCodeScheme<Self>;
55
56    /// Convert a string to an iterator over characters.
57    fn string_to_iter(string: &str) -> Self::Iter<'_>;
58}
59
60/// Helper structure to display a [`Character`] in a `format!` or `write!` macro.
61pub struct DisplayableCharacter<C: Character>(C);
62impl<C: Character> Display for DisplayableCharacter<C> {
63    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64        self.0.display_fmt(f);
65        Ok(())
66    }
67}
68
69/// Helper structure to display an [`Option`]`<`[`Character`]`>` in a `format!` or `write!` macro (primarily
70/// for the current `\escapechar`).
71pub struct DisplayableCharacterOpt<C: Character>(Option<C>);
72impl<C: Character> Display for DisplayableCharacterOpt<C> {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        if let Some(c) = self.0 {
75            c.display_fmt(f)
76        }
77        Ok(())
78    }
79}
80
/** A map from characters `C:`[`Character`] to values of some other type `A`. For `u8`, we can
   simply use `[A; 256]`.
   For example, a [`CategoryCodeScheme`] is a [`CharacterMap`]`<`[`CategoryCode`]`>`.
*/
pub trait CharacterMap<C: Character, A: Default>: Clone {
    /// Returns a shared reference to the value stored for character `c`.
    fn get(&self, c: C) -> &A;
    /// Returns a mutable reference to the value stored for character `c`.
    fn get_mut(&mut self, c: C) -> &mut A;
    /// Creates a fresh map. The `[A; 256]` implementation fills every entry
    /// with `A::default()`.
    fn default() -> Self;
}
89
90impl Character for u8 {
91    type CharMap<A: Clone + Default> = [A; 256];
92    const MIN: Self = 0;
93    const MAX: Self = 255;
94
95    fn to_char(&self) -> char {
96        *self as char
97    }
98
99    type Iter<'a> = ByteIterator<'a>;
100
101    fn string_to_iter(string: &str) -> Self::Iter<'_> {
102        ByteIterator(string.as_bytes())
103    }
104
105    fn convert(input: Vec<u8>) -> TextLine<Self> {
106        input.into()
107    }
108
109    #[allow(unused_must_use)]
110    fn display_fmt<W: std::fmt::Write>(&self, target: &mut W) {
111        if self.is_ascii() {
112            target.write_char(*self as char);
113        } else if *self > 128 && (*self - 64).is_ascii() {
114            target.write_str("^^");
115            target.write_char((*self - 64) as char);
116        } else {
117            target.write_str(format!("^^{:x}", *self).as_str());
118        }
119    }
120
121    fn starting_catcode_scheme() -> [CategoryCode; 256] {
122        super::catcodes::STARTING_SCHEME_U8
123    }
124}
125
/// Iterator over the bytes of a string that decodes `^^` escapes into single bytes
/// (without the escapes, `string.as_bytes().iter()` would do).
///
/// Decoding rules, applied to the front of the remaining input:
/// * `^^` followed by a byte `> 60` and one further byte, where the two bytes form a
///   two-digit hexadecimal number: that number (consumes 4 input bytes).
/// * `^^` followed by any other byte `b`: `b + 64`, wrapping on overflow
///   (consumes 3 input bytes). This is also the fallback when the hexadecimal
///   form does not parse.
/// * Anything else, including `^^` with nothing after it: the next byte unchanged.
///
/// Decoding never panics, and [`len`](ExactSizeIterator::len) always equals the number
/// of items [`next`](Iterator::next) will still yield.
pub struct ByteIterator<'a>(&'a [u8]);

impl<'a> ByteIterator<'a> {
    /// Value of a single ASCII hexadecimal digit (either case), if `b` is one.
    fn hex_digit(b: u8) -> Option<u8> {
        match b {
            b'0'..=b'9' => Some(b - b'0'),
            b'a'..=b'f' => Some(b - b'a' + 10),
            b'A'..=b'F' => Some(b - b'A' + 10),
            _ => None,
        }
    }

    /// Value of the two-digit hexadecimal number `hi lo`, if both are hex digits.
    fn hex_pair(hi: u8, lo: u8) -> Option<u8> {
        // At most 15 * 16 + 15 == 255, so this cannot overflow.
        Some(Self::hex_digit(hi)? * 16 + Self::hex_digit(lo)?)
    }

    /// Decodes one item from the front of `bytes`; returns it together with the
    /// input that is left, or `None` if `bytes` is empty.
    fn split_front(bytes: &'a [u8]) -> Option<(u8, &'a [u8])> {
        match bytes {
            [] => None,
            [b'^', b'^', b, rest @ ..] => {
                // The two-digit hex form is only attempted for bytes above 60, so
                // e.g. `^^0a` is still the single-character form of `0`.
                if *b > 60 {
                    if let [lo, tail @ ..] = rest {
                        if let Some(value) = Self::hex_pair(*b, *lo) {
                            return Some((value, tail));
                        }
                    }
                }
                Some((b.wrapping_add(64), rest))
            }
            [first, rest @ ..] => Some((*first, rest)),
        }
    }

    /// Number of items that are still to be yielded; walks the remaining input once.
    fn remaining(&self) -> usize {
        let mut rest = self.0;
        let mut count = 0usize;
        while let Some((_, tail)) = Self::split_front(rest) {
            rest = tail;
            count += 1;
        }
        count
    }
}

impl<'a> Iterator for ByteIterator<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        let (item, rest) = Self::split_front(self.0)?;
        self.0 = rest;
        Some(item)
    }

    /// Exact, as required for an [`ExactSizeIterator`].
    fn size_hint(&self) -> (usize, Option<usize>) {
        let n = self.remaining();
        (n, Some(n))
    }
}

impl ExactSizeIterator for ByteIterator<'_> {
    fn len(&self) -> usize {
        self.remaining()
    }
}
180
181impl<A: Clone + Default> CharacterMap<u8, A> for [A; 256] {
182    fn get(&self, c: u8) -> &A {
183        &self[c as usize]
184    }
185
186    fn get_mut(&mut self, c: u8) -> &mut A {
187        &mut self[c as usize]
188    }
189    fn default() -> Self {
190        array_init::array_init(|_| A::default())
191    }
192}
193
/// A single line of characters, stored as a boxed slice of `C`.
pub type TextLine<C> = Box<[C]>;
196
/// A source of lines of characters, e.g. a file or a string.
pub trait TextLineSource<C: Character> {
    /// Returns the next line of characters, or `None` if there are no more lines.
    fn get_line(&mut self) -> Option<TextLine<C>>;
}
/// A source of lines of characters generated from a string.
pub struct StringLineSource<C: Character> {
    /// The lines that have not been handed out yet, in order.
    pub lines: std::vec::IntoIter<TextLine<C>>,
}
206impl<C: Character> StringLineSource<C> {
207    /// Obtain the characters in a string: Takes a byte iterator as input and returns a vector of [`TextLine`]s;
208    /// (split at `\n` or `\r\n`, removing trailing spaces).
209    pub fn make_lines<I: Iterator<Item = u8>>(iter: I) -> Vec<TextLine<C>> {
210        let mut lines = Vec::new();
211        let mut curr = Vec::new();
212        for b in iter {
213            if b == b'\n' {
214                if let Some(b'\r') = curr.last() {
215                    curr.pop();
216                }
217                while let Some(b' ') = curr.last() {
218                    curr.pop();
219                }
220                lines.push(C::convert(std::mem::take(&mut curr)));
221            } else {
222                curr.push(b);
223            }
224        }
225        if !curr.is_empty() {
226            lines.push(C::convert(curr));
227        }
228        lines
229    }
230}
231impl<C: Character> From<Vec<TextLine<C>>> for StringLineSource<C> {
232    fn from(lines: Vec<TextLine<C>>) -> Self {
233        Self {
234            lines: lines.into_iter(),
235        }
236    }
237}
238impl<C: Character> TextLineSource<C> for StringLineSource<C> {
239    fn get_line(&mut self) -> Option<TextLine<C>> {
240        self.lines.next()
241    }
242}
243impl<C: Character> From<&str> for StringLineSource<C> {
244    fn from(s: &str) -> Self {
245        Self::make_lines(s.as_bytes().iter().copied()).into()
246    }
247}
248impl<C: Character> From<String> for StringLineSource<C> {
249    fn from(s: String) -> Self {
250        Self::make_lines(s.into_bytes().into_iter()).into()
251    }
252}