1use crate::prelude::*;
2use crate::tex::characters::{TextLine, TextLineSource};
3use crate::utils::errors::InvalidCharacter;
4
/// The state of the tokenizer within the current input line. It determines
/// how space and end-of-line characters are treated.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum MouthState {
    /// At the start of a line (also the initial state). Space characters are
    /// dropped and an end-of-line character yields a paragraph token.
    NewLine,
    /// Entered after a space token or a multi-letter control sequence name;
    /// space characters are dropped.
    SkipBlank,
    /// Entered after any other token; a space character yields a space token.
    MidLine,
}
15
/// A tokenizer that reads lines of characters from a `TextLineSource` and
/// turns them into tokens according to a category code scheme.
#[derive(Clone, Debug)]
pub struct InputTokenizer<C: Character, S: TextLineSource<C>> {
    /// The current tokenizer state.
    pub state: MouthState,
    /// Number of the current line; starts at 1 and is incremented for every
    /// further line fetched from `source`.
    line: usize,
    /// Zero-based index into `current_line` of the next character to read.
    col: usize,
    /// The line currently being tokenized.
    current_line: TextLine<C>,
    /// The source that further lines are fetched from.
    pub source: S,
    /// Set once `source` has failed to provide another line.
    pub(crate) eof: bool,
    /// Scratch buffer reused for assembling control sequence names.
    tempstr: Vec<C>,
}
68
/// Shorthand for the `Handler` type that the control sequence name type of
/// token type `T` declares via `CSName`.
type Csh<T> = <<T as Token>::CS as CSName<<T as Token>::Char>>::Handler;
70
71impl<C: Character, S: TextLineSource<C>> InputTokenizer<C, S> {
72 pub fn new(mut source: S) -> Self {
74 Self {
75 state: MouthState::NewLine,
76 line: 1,
77 col: 0,
78 current_line: source.get_line().unwrap_or(TextLine::default()),
79 source,
80 eof: false,
81 tempstr: Vec::new(),
82 }
83 }
    /// Returns the current line number (the first line is line 1).
    pub fn line(&self) -> usize {
        self.line
    }
    /// Returns the current column, i.e. the internal zero-based character
    /// index plus one.
    pub fn column(&self) -> usize {
        self.col + 1
    }
    /// Returns whether the source has already failed to provide another line.
    pub fn eof(&self) -> bool {
        self.eof
    }
99
100 fn get_char(&mut self) -> Option<C> {
101 if self.col >= self.current_line.len() {
102 None
103 } else {
104 let next = self.current_line[self.col];
105 self.col += 1;
106 Some(next)
107 }
108 }
109
110 pub fn readline<T: Token<Char = C>, F: FnMut(T)>(&mut self, mut f: F) {
113 while self.col < self.current_line.len() {
114 let next = self.current_line[self.col];
115 self.col += 1;
116 match next.try_into() {
117 Ok(b' ') => f(T::space()),
118 _ => f(T::from_char_cat(next, CommandCode::Other)),
119 }
120 }
121 self.next_line();
122 }
123
124 pub fn read<T: Token<Char = C>, F: FnMut(T)>(
128 &mut self,
129 handler: &mut Csh<T>,
130 cc: &CategoryCodeScheme<C>,
131 endline: Option<C>,
132 mut f: F,
133 ) -> Result<(), InvalidCharacter<C>> {
134 let mut ingroups = 0;
135 let mut ret: Result<(), InvalidCharacter<C>> = Ok(());
136 let line = self.line;
137 while self.line == line || ingroups > 0 {
138 match self.get_char() {
139 None => {
140 if self.eof {
141 return ret;
142 }
143 if let Some(n) = self.return_endline::<T>(cc, endline, handler.par()) {
144 f(n)
145 }
146 return ret;
147 }
148 Some(c) => match self.check_char::<T>(handler, cc, endline, c) {
149 Ok(None) if self.line == line || ingroups > 0 => (),
150 Ok(None) => return ret,
151 Ok(Some(tk)) => {
152 if tk.command_code() == CommandCode::BeginGroup {
153 ingroups += 1
154 } else if tk.command_code() == CommandCode::EndGroup {
155 ingroups -= 1
156 }
157 f(tk)
158 }
159 Err(i) => {
160 f(T::from_char_cat(i.0, CommandCode::Other));
161 ret = Err(i)
162 }
163 },
164 }
165 }
166 ret
167 }
168
169 pub fn get_next<T: Token<Char = C>>(
172 &mut self,
173 handler: &mut Csh<T>,
174 cc: &CategoryCodeScheme<C>,
175 endline: Option<C>,
176 ) -> Result<Option<T>, InvalidCharacter<C>> {
177 loop {
178 match self.get_char() {
179 None if self.eof => return Ok(None),
180 None => {
181 if let Some(e) = self.return_endline::<T>(cc, endline, handler.par()) {
182 return Ok(Some(e));
184 }
185 }
186 Some(c) => {
187 if let Some(t) = self.check_char::<T>(handler, cc, endline, c)? {
188 return Ok(Some(t));
189 }
190 }
191 };
192 }
193 }
194
195 fn check_char<T: Token<Char = C>>(
196 &mut self,
197 handler: &mut Csh<T>,
198 cc: &CategoryCodeScheme<C>,
199 endline: Option<C>,
200 c: C,
201 ) -> Result<Option<T>, InvalidCharacter<C>> {
202 use CategoryCode::*;
203 match cc.get(c) {
204 EOL if self.state == MouthState::NewLine => {
205 self.next_line();
206 Ok(Some(self.do_par(handler.par())))
207 }
208 EOL => Ok(self.return_endline::<T>(cc, endline, handler.par())),
209 Space if self.state == MouthState::SkipBlank => Ok(None),
210 Space if self.state == MouthState::NewLine => Ok(None),
211 Space => {
212 self.state = MouthState::SkipBlank;
213 Ok(Some(T::space()))
214 }
215 Ignored => Ok(None),
216 Comment => {
217 self.next_line();
218 self.state = MouthState::NewLine;
219 Ok(None)
220 }
221 Invalid => Err(InvalidCharacter(c)),
222 Escape => Ok(Some(self.get_escape::<T>(handler, cc, endline))),
223 Superscript => match self.maybe_superscript(c) {
224 Some(c) => self.check_char::<T>(handler, cc, endline, c),
225 None => {
226 self.state = MouthState::MidLine;
227 Ok(Some(T::from_char_cat(c, CommandCode::Superscript)))
228 }
229 },
230 cc => {
231 self.state = MouthState::MidLine;
232 Ok(Some(T::from_char_cat(c, (*cc).into())))
233 }
234 }
235 }
236
237 fn next_line(&mut self) {
238 if let Some(next) = self.source.get_line() {
239 self.current_line = next;
240 self.line += 1;
241 self.col = 0;
242 } else {
243 self.eof = true;
244 self.col = self.current_line.len();
245 self.state = MouthState::MidLine;
246 }
247 }
248
249 fn do_par<T: Token<Char = C>>(&mut self, par: T::CS) -> T {
250 if self.current_line.is_empty() {
251 while let Some(line) = self.source.get_line() {
252 self.line += 1;
253 if !line.is_empty() {
254 self.current_line = line;
255 break;
256 }
257 }
258 }
259 T::from_cs(par)
260 }
261
262 fn return_endline<T: Token<Char = C>>(
263 &mut self,
264 cc: &CategoryCodeScheme<C>,
265 endline: Option<C>,
266 par: T::CS,
267 ) -> Option<T> {
268 use CategoryCode::*;
269 self.next_line();
270 let ret = match endline {
271 None => None,
272 Some(c) => match cc.get(c) {
273 Space | EOL if self.state == MouthState::SkipBlank => None,
274 Space if self.state == MouthState::NewLine => None,
275 EOL if self.state == MouthState::NewLine => Some(self.do_par(par)),
276 EOL => Some(T::space()),
277 Ignored | Invalid | Comment => None,
278 o => Some(T::from_char_cat(c, (*o).into())),
279 },
280 };
281 self.state = MouthState::NewLine;
282 ret
283 }
284
285 fn get_escape<T: Token<Char = C>>(
286 &mut self,
287 handler: &mut Csh<T>,
288 cc: &CategoryCodeScheme<C>,
289 endline: Option<C>,
290 ) -> T {
291 let name = match self.get_char() {
292 None => {
293 self.next_line();
294 match endline {
295 None => handler.empty_str(),
296 Some(c) => {
297 self.tempstr.clear();
298 self.tempstr.push(c);
299 handler.cs_from_chars(&self.tempstr)
300 }
301 }
302 }
303 Some(next) => self.check_escape::<T>(handler, cc, next),
304 };
305 T::from_cs(name)
306 }
307
308 fn check_escape<T: Token<Char = C>>(
309 &mut self,
310 handler: &mut Csh<T>,
311 cc: &CategoryCodeScheme<C>,
312 next: C,
313 ) -> T::CS {
314 use CategoryCode::*;
315 match cc.get(next) {
316 Superscript => match self.maybe_superscript(next) {
317 Some(c) => self.check_escape::<T>(handler, cc, c),
318 None => {
319 self.state = MouthState::MidLine;
320 self.tempstr.clear();
321 self.tempstr.push(next);
322 handler.cs_from_chars(&self.tempstr)
323 }
324 },
325 Letter => self.get_cs_name::<T>(handler, cc, next),
326 _ => {
327 self.state = MouthState::MidLine;
328 self.tempstr.clear();
329 self.tempstr.push(next);
330 handler.cs_from_chars(&self.tempstr)
331 }
332 }
333 }
334
335 fn get_cs_name<T: Token<Char = C>>(
336 &mut self,
337 handler: &mut Csh<T>,
338 cc: &CategoryCodeScheme<C>,
339 first: C,
340 ) -> T::CS {
341 self.tempstr.clear();
342 self.tempstr.push(first);
343 self.state = MouthState::SkipBlank;
344 loop {
345 match self.get_char() {
346 None => break,
347 Some(next) => match cc.get(next) {
348 CategoryCode::Letter => self.tempstr.push(next),
349 CategoryCode::Superscript => {
350 let curr = self.col;
351 match self.maybe_superscript(next) {
352 Some(c) if *cc.get(c) == CategoryCode::Letter => self.tempstr.push(c),
353 _ => {
354 self.col = curr;
355 self.col -= 1;
356 break;
357 }
358 }
359 }
360 _ => {
361 self.col -= 1;
362 break;
363 }
364 },
365 }
366 }
367 handler.cs_from_chars(&self.tempstr)
368 }
369
370 fn cond(i: C) -> bool {
371 (Into::<C>::into(48u8) <= i && i <= Into::<C>::into(57u8))
372 || (Into::<C>::into(97u8) <= i && i <= Into::<C>::into(102u8))
373 }
374
375 fn maybe_superscript(&mut self, firstsup: C) -> Option<C> {
376 match self.get_char() {
377 None => None,
378 Some(c) if c != firstsup => {
379 self.col -= 1;
380 None
381 }
382 Some(_) => match self.get_char() {
383 None => {
384 self.col -= 1;
385 None
386 }
387 Some(first) => match self.get_char() {
388 None => {
389 if first < (128).into() {
390 let u: u8 = match first.try_into() {
391 Ok(u) => u,
392 Err(_) => {
393 self.col -= 1;
394 return None;
395 }
396 };
397 let ch: C = (if u < 64 { u + 64 } else { u - 64 }).into();
398 Some(ch)
399 } else {
400 self.col -= 2;
401 None
402 }
403 }
404 Some(second) => {
405 if Self::cond(first) && Self::cond(second) {
406 let ufirst: u8 = match first.try_into() {
407 Ok(u) => u,
408 Err(_) => {
409 self.col -= 2;
410 return None;
411 }
412 };
413 let usecond: u8 = match second.try_into() {
414 Ok(u) => u,
415 Err(_) => {
416 self.col -= 2;
417 return None;
418 }
419 };
420 let char = u8::from_str_radix(
421 std::str::from_utf8(&[ufirst, usecond]).unwrap(),
422 16,
423 )
424 .unwrap();
425 Some(char.into())
426 } else {
427 self.col -= 1;
428 if first < (128).into() {
429 let u: u8 = match first.try_into() {
430 Ok(u) => u,
431 Err(_) => {
432 self.col -= 2;
433 return None;
434 }
435 };
436 let ch: C = (if u < 64 { u + 64 } else { u - 64 }).into();
437 Some(ch)
438 } else {
439 self.col -= 2;
440 None
441 }
442 }
443 }
444 },
445 },
446 }
447 }
448
449 pub fn preview<W: std::fmt::Write>(&self, len: &mut usize, mut f: W) -> std::fmt::Result {
451 if self.current_line.is_empty() {
452 return Ok(());
453 }
454 if self.current_line.len() > self.col {
455 for c in &self.current_line[self.col..] {
456 *len -= 1;
457 c.display_fmt(&mut f);
458 if *len == 0 {
459 return Ok(());
460 }
461 }
462 }
463 Ok(())
464 }
465}