Skip to main content

tex_engine/tex/
tokens.rs

1/*! A [Token] is -- conceptually -- either a [control sequence](CSName),
2or a pair of a [character](Character) and a [`CategoryCode`](super::catcodes::CategoryCode). In practice, we use
3[`CommandCode`] instead, which omits "impossible" codes (e.g. [`Invalid`](super::catcodes::CategoryCode::Invalid) or
4[`Comment`](super::catcodes::CategoryCode::Comment)) and adds internal ones (e.g. [`Primitive`](CommandCode::Primitive) or
5[`Argument`](CommandCode::Argument)).
6
7The "canonical" way to represent a [`Token`] is [`StandardToken`], which is an enum with two variants.
8However, since [`Token`]s are read, processed, inspected, passed around, stored and retrieved extremely often, and in
9the most performance-critical parts of the engine, their representation matters. In particular, we want them to be small
10and ideally `Copy`, which excludes representing control sequences as strings; hence the generic [`CSName`] type
11and [`CompactToken`] as a significantly more efficient representation.
12 */
13
14use crate::commands::primitives::PrimitiveIdentifier;
15use crate::tex::catcodes::{CategoryCodeScheme, CommandCode};
16use crate::tex::characters::Character;
17use crate::tex::tokens::control_sequences::{CSName, InternedCSName};
18use std::fmt::Write;
19use std::marker::PhantomData;
20use std::num::NonZeroU32;
21
22pub mod control_sequences;
23pub mod token_lists;
24
25/// Trait for Tokens, to be implemented for an engine (see [above](crate::tex::tokens)).
26/// Note that two [`Space`](CommandCode::Space) tokens are always considered equal.
27pub trait Token: Clone + Eq + 'static + std::fmt::Debug + Sized {
28    /// The [`CSName`] type used for control sequence names (e.g. `Rc<str>` or something interned).
29    type CS: CSName<Self::Char>;
30    /// The [`Character`] type for char/catcode-pair tokens.
31    type Char: Character;
32    /// Converts to the canonical enum representation of a token, i.e. [`StandardToken`].
33    fn to_enum(&self) -> StandardToken<Self::Char, Self::CS>;
34    /// Create a new token from a control sequence name.
35    fn from_cs(cs: Self::CS) -> Self;
36    /// Create a new space token.
37    fn space() -> Self;
38
39    /// Create a new token representing a [primitive](PrimitiveIdentifier) [`Command`](crate::commands::TeXCommand).
40    fn primitive(id: PrimitiveIdentifier) -> Self;
41    /// Create a new argument marker token. `i` needs to be in the range `0..=8`.
42    fn argument_marker(i: u8) -> Self;
43    /// Create a new end-of-file token.
44    fn eof() -> Self;
45    /// Create a new character token with given [`CommandCode`] (i.e.
46    /// conceptually the [`CategoryCode`](super::catcodes::CategoryCode)).
47    fn from_char_cat(c: Self::Char, cat: CommandCode) -> Self;
48    /// The [`Character`] value of this token, if it is a character token.
49
50    fn char_value(&self) -> Option<Self::Char> {
51        match self.to_enum() {
52            StandardToken::Character(c, _) => Some(c),
53            _ => None,
54        }
55    }
56    /// The [`CommandCode`] (i.e. conceptually the [`CategoryCode`](super::catcodes::CategoryCode)) of this token.
57
58    fn command_code(&self) -> CommandCode {
59        match self.to_enum() {
60            StandardToken::ControlSequence(_) => CommandCode::Escape,
61            StandardToken::Character(_, cat) => cat,
62            StandardToken::Primitive(_) => CommandCode::Primitive,
63        }
64    }
65
66    /// Check if this token is a control sequence or an active character
67
68    fn is_cs_or_active(&self) -> bool {
69        matches!(
70            self.to_enum(),
71            StandardToken::ControlSequence(_)
72                | StandardToken::Character(_, CommandCode::Active)
73                | StandardToken::Primitive(_)
74        )
75    }
76
77    /// Check if this token is a control sequence with the given name.
78
79    fn is_cs(&self, name: &Self::CS) -> bool {
80        match self.to_enum() {
81            StandardToken::ControlSequence(cs) => cs == *name,
82            _ => false,
83        }
84    }
85    fn is_primitive(&self) -> Option<PrimitiveIdentifier> {
86        match self.to_enum() {
87            StandardToken::Primitive(id) => Some(id),
88            _ => None,
89        }
90    }
91    /// Check if this token is a argument token, and if so, return its number (in the range `0..=8`).
92    fn is_argument_marker(&self) -> Option<u8> {
93        match self.to_enum() {
94            StandardToken::Character(c, CommandCode::Argument) => Some(c.try_into().ok().unwrap()),
95            _ => None,
96        }
97    }
98
99    /// Display this token to a writer, using the given [`CSHandler`](control_sequences::CSHandler) (in case it is a control sequence).
100    /// In that case, we also need the current `\escapechar` to optionally insert it in front of the control sequence
101    /// name, and the current [`CategoryCodeScheme`] to determine whether or not to insert a space afterwards - which
102    /// we do unless the control sequence name is a single character with any [`CommandCode`] other than
103    /// [`Letter`](CommandCode::Letter).
104    fn display_fmt<W: Write>(
105        &self,
106        int: &<Self::CS as CSName<Self::Char>>::Handler,
107        cc: &CategoryCodeScheme<Self::Char>,
108        escapechar: Option<Self::Char>,
109        f: &mut W,
110    ) -> std::fmt::Result {
111        match self.to_enum() {
112            StandardToken::Character(_, CommandCode::Space) => f.write_char(' '),
113            StandardToken::Character(c, _) => {
114                c.display_fmt(f);
115                Ok(())
116            }
117            StandardToken::ControlSequence(cs) => cs.display_fmt(int, cc, escapechar, f),
118            StandardToken::Primitive(id) => write!(
119                f,
120                "{}pdfprimitive {}",
121                Self::Char::display_opt(escapechar),
122                id.display(escapechar)
123            ),
124        }
125    }
126    /// Returns a helper struct implementing [`Display`](std::fmt::Display) for this token.
127    fn display<'a>(
128        &'a self,
129        int: &'a <Self::CS as CSName<Self::Char>>::Handler,
130        cc: &'a CategoryCodeScheme<Self::Char>,
131        escapechar: Option<Self::Char>,
132    ) -> DisplayToken<'a, Self> {
133        DisplayToken {
134            tk: self,
135            int,
136            cc,
137            escapechar,
138        }
139    }
140}
141
142pub struct DisplayToken<'a, T: Token> {
143    tk: &'a T,
144    int: &'a <T::CS as CSName<T::Char>>::Handler,
145    cc: &'a CategoryCodeScheme<T::Char>,
146    escapechar: Option<T::Char>,
147}
148impl<'a, T: Token> std::fmt::Display for DisplayToken<'a, T> {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        self.tk.display_fmt(self.int, self.cc, self.escapechar, f)
151    }
152}
153
154/** The simplest (but not most efficient) way to represent a [`Token`] as an enum.
155
156Is [`Copy`] iff [`CS`](Token::CS) is [`Copy`].
157*/
158#[derive(Clone, Copy, Eq, Debug)]
159pub enum StandardToken<Char: Character, CS: CSName<Char>> {
160    ControlSequence(CS),
161    Character(Char, CommandCode),
162    Primitive(PrimitiveIdentifier),
163}
164impl<Char: Character, CS: CSName<Char>> PartialEq for StandardToken<Char, CS> {
165    fn eq(&self, other: &Self) -> bool {
166        match (self, other) {
167            (StandardToken::ControlSequence(a), StandardToken::ControlSequence(b)) => a == b,
168            (
169                StandardToken::Character(_, CommandCode::Space),
170                StandardToken::Character(_, CommandCode::Space),
171            ) => true,
172            (StandardToken::Character(a1, a2), StandardToken::Character(b1, b2)) => {
173                a1 == b1 && a2 == b2
174            }
175            (StandardToken::Primitive(a), StandardToken::Primitive(b)) => a == b,
176            _ => false,
177        }
178    }
179}
180impl<Char: Character, CS: CSName<Char>> Token for StandardToken<Char, CS> {
181    type CS = CS;
182    type Char = Char;
183
184    fn to_enum(&self) -> StandardToken<Char, CS> {
185        self.clone()
186    }
187
188    fn from_cs(cs: CS) -> Self {
189        StandardToken::ControlSequence(cs)
190    }
191
192    fn space() -> Self {
193        StandardToken::Character(Char::from(32), CommandCode::Space)
194    }
195
196    fn eof() -> Self {
197        StandardToken::Character(Char::from(0), CommandCode::EOF)
198    }
199
200    fn from_char_cat(c: Char, cat: CommandCode) -> Self {
201        StandardToken::Character(c, cat)
202    }
203
204    fn primitive(id: PrimitiveIdentifier) -> Self {
205        Self::Primitive(id)
206    }
207
208    fn argument_marker(i: u8) -> Self {
209        Self::Character(Char::from(i), CommandCode::Argument)
210    }
211}
212
213/** A compact representation of a [`Token`] with [`Char`](Token::Char)`==u8` and [`CS`](Token::CS)`==`[`InternedCSName`]
214 as a single `u32` (similar to the way plain TeX does it) -- i.e. it is small and `Copy`, which yields a significant
215 performance improvement in the most performance critical parts of the code.
216
217Values up to `0x8000_0000` are interpreted as interned control sequences, and the rest as character tokens. The downside
218is that we need an interning table for control sequences, that needs passing around whenever we want to
219make a Token from a control sequence name or display a [`CompactToken`] to the user in a comprehensible way.
220
221(Also, we can only have 2³¹ control sequences in total, but that limit is ridiculously large.)
222*/
223#[derive(Clone, Copy, Eq, Debug)]
224pub struct CompactToken(NonZeroU32);
225impl CompactToken {
226    fn is_string(&self) -> bool {
227        self.0.get() < 0x8000_0000
228    }
229
230    fn as_string(&self) -> Option<InternedCSName<u8>> {
231        if self.is_string() {
232            Some((self.0, PhantomData))
233            //Some(InternedString::try_from_usize(self.0 as usize).unwrap())
234        } else {
235            None
236        }
237    }
238
239    fn commandcode_value(&self) -> u8 {
240        ((self.0.get() & 0x00FF_0000) >> 16) as u8
241    }
242
243    fn code(&self) -> CommandCode {
244        CommandCode::try_from(self.commandcode_value()).unwrap()
245    }
246
247    fn u8(&self) -> u8 {
248        (self.0.get() & 0x0000_00FF) as u8
249    }
250}
251impl PartialEq for CompactToken {
252    fn eq(&self, other: &Self) -> bool {
253        self.0 == other.0 || {
254            if self.is_string() || other.is_string() {
255                return false;
256            }
257            let cc1 = self.code();
258            let cc2 = other.code();
259            if cc1 == CommandCode::Space && cc2 == CommandCode::Space {
260                return true;
261            }
262            if cc1 != cc2 {
263                return false;
264            }
265            self.u8() == other.u8()
266        }
267    }
268}
269impl Token for CompactToken {
270    type CS = InternedCSName<u8>; //InternedString;
271    type Char = u8;
272    //const TOKEN_LIST_FACTORY: Option<RefCell<ReusableVectorFactory<Self>>> = Some(RefCell::new(ReusableVectorFactory::constant()));
273    fn to_enum(&self) -> StandardToken<u8, InternedCSName<u8>> {
274        match self.as_string() {
275            Some(s) => StandardToken::ControlSequence(s),
276            None => match self.is_primitive() {
277                Some(i) => StandardToken::Primitive(i),
278                None => StandardToken::Character(self.u8(), self.code()),
279            },
280        }
281    }
282
283    fn from_cs(cs: Self::CS) -> Self {
284        Self(cs.0)
285    }
286
287    fn from_char_cat(c: u8, cat: CommandCode) -> Self {
288        Self(NonZeroU32::new(0x8000_0000 | ((cat.as_byte() as u32) << 16) | (c as u32)).unwrap())
289    }
290
291    fn space() -> Self {
292        Self::from_char_cat(32, CommandCode::Space)
293    }
294
295    fn eof() -> Self {
296        Self::from_char_cat(0, CommandCode::EOF)
297    }
298
299    fn primitive(id: PrimitiveIdentifier) -> Self {
300        Self(
301            NonZeroU32::new(
302                0x8000_0000
303                    | ((CommandCode::Primitive.as_byte() as u32) << 16)
304                    | (id.as_u16() as u32),
305            )
306            .unwrap(),
307        )
308    }
309    fn is_primitive(&self) -> Option<PrimitiveIdentifier> {
310        if !self.is_string()
311            && (((self.0.get() & 0x00FF_0000) >> 16) as u8) == CommandCode::Primitive.as_byte()
312        {
313            PrimitiveIdentifier::try_from_u16((self.0.get() & 0x0000_FFFF) as u16)
314        } else {
315            None
316        }
317    }
318
319    fn argument_marker(i: u8) -> Self {
320        Self::from_char_cat(i, CommandCode::Argument)
321    }
322
323    fn command_code(&self) -> CommandCode {
324        if self.is_string() {
325            CommandCode::Escape
326        } else {
327            self.code()
328        }
329    }
330
331    fn char_value(&self) -> Option<Self::Char> {
332        if self.is_string() {
333            None
334        } else {
335            Some(self.u8())
336        }
337    }
338
339    fn is_cs_or_active(&self) -> bool {
340        self.is_string() || {
341            let cc = ((self.0.get() & 0x00FF_0000) >> 16) as u8;
342            cc == CommandCode::Active.as_byte() || cc == CommandCode::Primitive.as_byte()
343        }
344    }
345
346    fn is_cs(&self, name: &Self::CS) -> bool {
347        self.0 == name.0
348    }
349
350    fn is_argument_marker(&self) -> Option<u8> {
351        if !self.is_string()
352            && (((self.0.get() & 0x00FF_0000) >> 16) as u8) == CommandCode::Argument.as_byte()
353        {
354            Some(self.u8())
355        } else {
356            None
357        }
358    }
359}