Skip to main content

tex_engine/tex/
characters.rs

1/*! Data structures for reading input text. */
2use crate::tex::catcodes::{CategoryCode, CategoryCodeScheme};
3use std::fmt::{Debug, Display};
4
5/** A single character in a `.tex` file; in plain TeX, this is a `u8`,
6but in e.g. XeTeX, it is a UTF-8 character. */
7pub trait Character:
8    Sized
9    + Eq
10    + Copy
11    + Display
12    + Debug
13    + From<u8>
14    + TryInto<u8>
15    + TryFrom<u64>
16    + Into<u64>
17    + Ord
18    + std::hash::Hash
19    + Default
20    + 'static
21{
22    /// Type that maps characters to other data.
23    type CharMap<A: Clone + Default>: CharacterMap<Self, A>;
24    /// Iterator over characters in a string.
25    type Iter<'a>: ExactSizeIterator<Item = Self>;
26    /// minimal value of this type in numeric form (e.g. `0` for `u8`)
27    const MIN: Self;
28    /// maximal value of this type in numeric form (e.g. `255` for `u8`)
29    const MAX: Self;
30    /// Convert a line in a file/string (as a vector of bytes) into a [`Vec`] of [`Character`]s.
31    fn convert(input: Vec<u8>) -> TextLine<Self>;
32
33    fn slice_from_str<R>(s: &str, then: impl FnOnce(&[Self]) -> R) -> R;
34
35    /// Display this character to a [`Write`](std::fmt::Write) (e.g. a `&mut String`). Relevant for e.g.
36    /// TeX's convention to display control characters using `^^` encoding.
37    fn display_fmt<W: std::fmt::Write>(&self, target: &mut W);
38
39    /// Convert this character to a [`DisplayableCharacter`] that calls [`display`](Self::display_fmt); useful in
40    /// `format!` and `write!` macros.
41
42    fn display(&self) -> DisplayableCharacter<Self> {
43        DisplayableCharacter(*self)
44    }
45
46    /// Convert this character to a `char`.
47    fn to_char(&self) -> char;
48
49    /// Like [`display`](Self::display), but for an [`Option`]`<`[`Character`]`>`. Useful for
50    /// `format!` and `write!` macros specifically for the current `\ecapechar` (which may or may not be defined).
51
52    fn display_opt(c: Option<Self>) -> DisplayableCharacterOpt<Self> {
53        DisplayableCharacterOpt(c)
54    }
55    /// The starting [`CategoryCodeScheme`] for this character type.
56    fn starting_catcode_scheme() -> CategoryCodeScheme<Self>;
57
58    /// Convert a string to an iterator over characters.
59    fn string_to_iter(string: &str) -> Self::Iter<'_>;
60}
61
62/// Helper structure to display a [`Character`] in a `format!` or `write!` macro.
63pub struct DisplayableCharacter<C: Character>(C);
64impl<C: Character> Display for DisplayableCharacter<C> {
65    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
66        self.0.display_fmt(f);
67        Ok(())
68    }
69}
70
71/// Helper structure to display an [`Option`]`<`[`Character`]`>` in a `format!` or `write!` macro (primarily
72/// for the current `\escapechar`).
73pub struct DisplayableCharacterOpt<C: Character>(Option<C>);
74impl<C: Character> Display for DisplayableCharacterOpt<C> {
75    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
76        if let Some(c) = self.0 {
77            c.display_fmt(f)
78        }
79        Ok(())
80    }
81}
82
83/** A map from characters `C:`[Character] to some other type `A`. For `u8`, we can simply use `[A;256]`.
84   For example, a [`CategoryCodeScheme`] is a [`CharacterMap`]`<`[`CategoryCode`]`>`.
85*/
86pub trait CharacterMap<C: Character, A: Default>: Clone {
87    fn get(&self, c: C) -> &A;
88    fn get_mut(&mut self, c: C) -> &mut A;
89    fn default() -> Self;
90}
91
92impl Character for u8 {
93    type CharMap<A: Clone + Default> = [A; 256];
94    const MIN: Self = 0;
95    const MAX: Self = 255;
96
97    fn to_char(&self) -> char {
98        *self as char
99    }
100
101    type Iter<'a> = ByteIterator<'a>;
102
103    fn slice_from_str<R>(s: &str, then: impl FnOnce(&[Self]) -> R) -> R {
104        if s.contains("^^") {
105            then(&Self::string_to_iter(s).collect::<Vec<_>>())
106        } else {
107            then(s.as_bytes())
108        }
109    }
110
111    fn string_to_iter(string: &str) -> Self::Iter<'_> {
112        ByteIterator(string.as_bytes())
113    }
114
115    fn convert(input: Vec<u8>) -> TextLine<Self> {
116        input.into()
117    }
118
119    #[allow(unused_must_use)]
120    fn display_fmt<W: std::fmt::Write>(&self, target: &mut W) {
121        if self.is_ascii() {
122            target.write_char(*self as char);
123        } else if *self > 128 && (*self - 64).is_ascii() {
124            target.write_str("^^");
125            target.write_char((*self - 64) as char);
126        } else {
127            target.write_str(format!("^^{:x}", *self).as_str());
128        }
129    }
130
131    fn starting_catcode_scheme() -> [CategoryCode; 256] {
132        super::catcodes::STARTING_SCHEME_U8
133    }
134}
135
/// Iterator over the bytes of a string that turns `^^` escapes into single bytes (without
/// them, `string.as_bytes().iter()` would do).
///
/// At the front of the remaining input:
/// * `^^` followed by two hex digits, the first of which has a byte value above 60 (i.e. is a
///   letter), yields the byte with that hex value and consumes four bytes;
/// * `^^` followed by any other byte `b` yields `b + 64` (wrapping) and consumes three bytes;
/// * anything else, including a `^^` with nothing after it, yields the next byte unchanged.
///
/// The iterator never panics, and [`len`](ExactSizeIterator::len) always equals the number of
/// items [`next`](Iterator::next) will still produce.
pub struct ByteIterator<'a>(&'a [u8]);

impl ByteIterator<'_> {
    /// Decodes the character at the front of `bytes`.
    ///
    /// Returns the decoded byte together with the number of input bytes it occupies, or `None`
    /// if `bytes` is empty. Both `next` and `len` go through this function so that they cannot
    /// disagree.
    fn decode_front(bytes: &[u8]) -> Option<(u8, usize)> {
        match *bytes {
            [] => None,
            [b'^', b'^', hi, lo, ..] if hi > 60 => {
                let digits = (hi as char).to_digit(16).zip((lo as char).to_digit(16));
                Some(match digits {
                    // Two hex digits are at most 0xff, so the cast cannot truncate.
                    Some((high, low)) => ((high * 16 + low) as u8, 4),
                    // Not a hex pair: decode like a single-character escape instead of
                    // panicking.
                    None => (hi.wrapping_add(64), 3),
                })
            }
            [b'^', b'^', b, ..] => Some((b.wrapping_add(64), 3)),
            // Covers ordinary bytes, a lone `^`, and a dangling `^^` at the very end.
            [b, ..] => Some((b, 1)),
        }
    }
}

impl<'a> Iterator for ByteIterator<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        let (byte, consumed) = Self::decode_front(self.0)?;
        self.0 = &self.0[consumed..];
        Some(byte)
    }

    /// Exact, as required of an [`ExactSizeIterator`]; costs one scan of the remaining input.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = ExactSizeIterator::len(self);
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for ByteIterator<'_> {
    /// Counts the remaining items by decoding (but not consuming) the rest of the input.
    fn len(&self) -> usize {
        let mut rest = self.0;
        let mut count = 0usize;
        while let Some((_, consumed)) = ByteIterator::decode_front(rest) {
            rest = &rest[consumed..];
            count += 1;
        }
        count
    }
}
190
191impl<A: Clone + Default> CharacterMap<u8, A> for [A; 256] {
192    fn get(&self, c: u8) -> &A {
193        &self[c as usize]
194    }
195
196    fn get_mut(&mut self, c: u8) -> &mut A {
197        &mut self[c as usize]
198    }
199    fn default() -> Self {
200        array_init::array_init(|_| A::default())
201    }
202}
203
204/// A single line of characters.
205pub type TextLine<C> = Box<[C]>;
206
207/// A source of lines of characters, e.g. a file or a string.
208pub trait TextLineSource<C: Character> {
209    /// returns the next line of characters, or `None` if there are no more lines.
210    fn get_line(&mut self) -> Option<TextLine<C>>;
211}
212/// A source of lines of characters generated from a string.
213pub struct StringLineSource<C: Character> {
214    pub lines: std::vec::IntoIter<TextLine<C>>,
215}
216impl<C: Character> StringLineSource<C> {
217    /// Obtain the characters in a string: Takes a byte iterator as input and returns a vector of [`TextLine`]s;
218    /// (split at `\n` or `\r\n`, removing trailing spaces).
219    pub fn make_lines<I: Iterator<Item = u8>>(iter: I) -> Vec<TextLine<C>> {
220        let mut lines = Vec::new();
221        let mut curr = Vec::new();
222        for b in iter {
223            if b == b'\n' {
224                if let Some(b'\r') = curr.last() {
225                    curr.pop();
226                }
227                while let Some(b' ') = curr.last() {
228                    curr.pop();
229                }
230                lines.push(C::convert(std::mem::take(&mut curr)));
231            } else {
232                curr.push(b);
233            }
234        }
235        if !curr.is_empty() {
236            lines.push(C::convert(curr));
237        }
238        lines
239    }
240}
241impl<C: Character> From<Vec<TextLine<C>>> for StringLineSource<C> {
242    fn from(lines: Vec<TextLine<C>>) -> Self {
243        Self {
244            lines: lines.into_iter(),
245        }
246    }
247}
248impl<C: Character> TextLineSource<C> for StringLineSource<C> {
249    fn get_line(&mut self) -> Option<TextLine<C>> {
250        self.lines.next()
251    }
252}
253impl<C: Character> From<&str> for StringLineSource<C> {
254    fn from(s: &str) -> Self {
255        Self::make_lines(s.as_bytes().iter().copied()).into()
256    }
257}
258impl<C: Character> From<String> for StringLineSource<C> {
259    fn from(s: String) -> Self {
260        Self::make_lines(s.into_bytes().into_iter()).into()
261    }
262}