1
//! Break a string into a set of directory-object Items.
//!
//! This module defines Item, which represents a basic entry in a
//! directory document, and NetDocReader, which is used to break a
//! string into Items.
6

            
7
use crate::parse::keyword::Keyword;
8
use crate::types::misc::FromBytes;
9
use crate::util::PeekableIterator;
10
use crate::{Error, NetdocErrorKind as EK, Pos, Result};
11
use base64ct::{Base64, Encoding};
12
use itertools::Itertools;
13
use std::cell::{Ref, RefCell};
14
use std::iter::Peekable;
15
use std::str::FromStr;
16
use tor_error::internal;
17

            
18
/// Useful constants for netdoc object syntax
pub(crate) mod object {
    /// indicates the start of an object
    pub(crate) const BEGIN_STR: &str = "-----BEGIN ";
    /// indicates the end of an object
    pub(crate) const END_STR: &str = "-----END ";
    /// indicates the end of a begin or end tag.
    pub(crate) const TAG_END: &str = "-----";
    /// Maximum PEM base64 line length (not enforced during parsing)
    pub(crate) const BASE64_PEM_MAX_LINE: usize = 64;
}
29

            
30
/// Return true iff a given character is "space" according to the rules
/// of dir-spec.txt
///
/// Only the ASCII space and horizontal tab qualify; every other
/// character (including newline and carriage return) is rejected.
pub(crate) fn is_sp(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
35
/// Check that all the characters in `s` are valid base64.
36
///
37
/// This is not a perfect check for base64ness -- it is mainly meant
38
/// to help us recover after unterminated base64.
39
257606
fn b64check(s: &str) -> Result<()> {
40
16106338
    for b in s.bytes() {
41
16106338
        match b {
42
24640
            b'=' => (),
43
6495918
            b'a'..=b'z' => (),
44
6635058
            b'A'..=b'Z' => (),
45
2467354
            b'0'..=b'9' => (),
46
483362
            b'/' | b'+' => (),
47
            _ => {
48
6
                return Err(EK::BadObjectBase64.at_pos(Pos::at(s)));
49
            }
50
        };
51
    }
52
257600
    Ok(())
53
257606
}
54

            
55
/// A tagged object that is part of a directory Item.
///
/// This represents a single blob within a pair of "-----BEGIN
/// FOO-----" and "-----END FOO-----".  The data is not guaranteed to
/// be actual base64 when this object is created: doing so would
/// require either that we parse the base64 twice, or that we allocate
/// a buffer to hold the data before it's needed.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Object<'a> {
    /// Reference to the "tag" string (the 'foo') in the BEGIN line.
    tag: &'a str,
    /// Reference to the allegedly base64-encoded data.  This may or
    /// may not actually be base64 at this point.
    data: &'a str,
    /// Reference to the END line for this object.  This doesn't
    /// need to be parsed, but it's used to find where this object
    /// ends.
    endline: &'a str,
}
74

            
75
/// A single part of a directory object.
76
///
77
/// Each Item -- called an "entry" in dir-spec.txt -- has a keyword, a
78
/// (possibly empty) set of arguments, and an optional object.
79
///
80
/// This is a zero-copy implementation that points to slices within a
81
/// containing string.
82
#[derive(Clone, Debug)]
83
pub(crate) struct Item<'a, K: Keyword> {
84
    /// The keyword that determines the type of this item.
85
    kwd: K,
86
    /// A reference to the actual string that defines the keyword for
87
    /// this item.
88
    kwd_str: &'a str,
89
    /// Reference to the arguments that appear in the same line after the
90
    /// keyword.  Does not include the terminating newline or the
91
    /// space that separates the keyword for its arguments.
92
    args: &'a str,
93
    /// The arguments, split by whitespace.  This vector is constructed
94
    /// as needed, using interior mutability.
95
    split_args: RefCell<Option<Vec<&'a str>>>,
96
    /// If present, a base-64-encoded object that appeared at the end
97
    /// of this item.
98
    object: Option<Object<'a>>,
99
}
100

            
101
/// A cursor into a string that returns Items one by one.
102
///
103
/// (This type isn't used directly, but is returned wrapped in a Peekable.)
104
#[derive(Debug)]
105
struct NetDocReaderBase<'a, K: Keyword> {
106
    /// The string we're parsing.
107
    s: &'a str,
108
    /// Our position within the string.
109
    off: usize,
110
    /// Tells Rust it's okay that we are parameterizing on K.
111
    _k: std::marker::PhantomData<K>,
112
}
113

            
114
impl<'a, K: Keyword> NetDocReaderBase<'a, K> {
115
    /// Create a new NetDocReader to split a string into tokens.
116
5448
    fn new(s: &'a str) -> Result<Self> {
117
        Ok(NetDocReaderBase {
118
5448
            s: validate_utf_8_rules(s)?,
119
            off: 0,
120
5448
            _k: std::marker::PhantomData,
121
        })
122
5448
    }
123
    /// Return the current Pos within the string.
124
210
    fn pos(&self, pos: usize) -> Pos {
125
210
        Pos::from_offset(self.s, pos)
126
210
    }
127
    /// Skip forward by n bytes.
128
    ///
129
    /// (Note that standard caveats with byte-oriented processing of
130
    /// UTF-8 strings apply.)
131
408502
    fn advance(&mut self, n: usize) -> Result<()> {
132
408502
        if n > self.remaining() {
133
            return Err(
134
                Error::from(internal!("tried to advance past end of document"))
135
                    .at_pos(Pos::from_offset(self.s, self.off)),
136
            );
137
408502
        }
138
408502
        self.off += n;
139
408502
        Ok(())
140
408502
    }
141
    /// Return the remaining number of bytes in this reader.
142
519052
    fn remaining(&self) -> usize {
143
519052
        self.s.len() - self.off
144
519052
    }
145

            
146
    /// Return true if the next characters in this reader are `s`
147
    #[allow(clippy::string_slice)] // TODO
148
105044
    fn starts_with(&self, s: &str) -> bool {
149
105044
        self.s[self.off..].starts_with(s)
150
105044
    }
151
    /// Try to extract a NL-terminated line from this reader.  Always
152
    /// remove data if the reader is nonempty.
153
    #[allow(clippy::string_slice)] // TODO
154
408502
    fn line(&mut self) -> Result<&'a str> {
155
408502
        let remainder = &self.s[self.off..];
156
408502
        if let Some(nl_pos) = remainder.find('\n') {
157
408338
            self.advance(nl_pos + 1)?;
158
408338
            let line = &remainder[..nl_pos];
159

            
160
            // TODO: we should probably detect \r and do something about it.
161
            // Just ignoring it isn't the right answer, though.
162
408338
            Ok(line)
163
        } else {
164
164
            self.advance(remainder.len())?; // drain everything.
165
164
            Err(EK::TruncatedLine.at_pos(self.pos(self.s.len())))
166
        }
167
408502
    }
168

            
169
    /// Try to extract a line that begins with a keyword from this reader.
170
    ///
171
    /// Returns a (kwd, args) tuple on success.
172
    #[allow(clippy::string_slice)] // TODO
173
105246
    fn kwdline(&mut self) -> Result<(&'a str, &'a str)> {
174
105246
        let pos = self.off;
175
105246
        let line = self.line()?;
176
105082
        if line.is_empty() {
177
14
            return Err(EK::EmptyLine.at_pos(self.pos(pos)));
178
105068
        }
179
105068
        let (line, anno_ok) = if let Some(rem) = line.strip_prefix("opt ") {
180
4
            (rem, false)
181
        } else {
182
105064
            (line, true)
183
        };
184
105068
        let mut parts_iter = line.splitn(2, [' ', '\t']);
185
105068
        let kwd = match parts_iter.next() {
186
105068
            Some(k) => k,
187
            // This case seems like it can't happen: split always returns
188
            // something, apparently.
189
            None => return Err(EK::MissingKeyword.at_pos(self.pos(pos))),
190
        };
191
105068
        if !keyword_ok(kwd, anno_ok) {
192
24
            return Err(EK::BadKeyword.at_pos(self.pos(pos)));
193
105044
        }
194
        // TODO(nickm): dir-spec does not yet allow unicode in the arguments, but we're
195
        // assuming that proposal 285 is accepted.
196
105044
        let args = match parts_iter.next() {
197
82948
            Some(a) => a,
198
            // take a zero-length slice, so it will be within the string.
199
22096
            None => &kwd[kwd.len()..],
200
        };
201
105044
        Ok((kwd, args))
202
105246
    }
203

            
204
    /// Try to extract an Object beginning wrapped within BEGIN/END tags.
205
    ///
206
    /// Returns Ok(Some(Object(...))) on success if an object is
207
    /// found, Ok(None) if no object is found, and Err only if a
208
    /// corrupt object is found.
209
    #[allow(clippy::string_slice)] // TODO
210
105044
    fn object(&mut self) -> Result<Option<Object<'a>>> {
211
        use object::*;
212

            
213
105044
        let pos = self.off;
214
105044
        if !self.starts_with(BEGIN_STR) {
215
82214
            return Ok(None);
216
22830
        }
217
22830
        let line = self.line()?;
218
22830
        if !line.ends_with(TAG_END) {
219
2
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
220
22828
        }
221
22828
        let tag = &line[BEGIN_STR.len()..(line.len() - TAG_END.len())];
222
22828
        if !tag_keywords_ok(tag) {
223
2
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
224
22826
        }
225
22826
        let datapos = self.off;
226
22820
        let (endlinepos, endline) = loop {
227
280426
            let p = self.off;
228
280426
            let line = self.line()?;
229
280426
            if line.starts_with(END_STR) {
230
22820
                break (p, line);
231
257606
            }
232
            // Exit if this line isn't plausible base64.  Otherwise,
233
            // an unterminated base64 block could potentially
234
            // "consume" all the rest of the string, which would stop
235
            // us from recovering.
236
257606
            b64check(line).map_err(|e| e.within(self.s))?;
237
        };
238
22820
        let data = &self.s[datapos..endlinepos];
239
22820
        if !endline.ends_with(TAG_END) {
240
2
            return Err(EK::BadObjectEndTag.at_pos(self.pos(endlinepos)));
241
22818
        }
242
22818
        let endtag = &endline[END_STR.len()..(endline.len() - TAG_END.len())];
243
22818
        if endtag != tag {
244
2
            return Err(EK::BadObjectMismatchedTag.at_pos(self.pos(endlinepos)));
245
22816
        }
246
22816
        Ok(Some(Object { tag, data, endline }))
247
105044
    }
248

            
249
    /// Read the next Item from this NetDocReaderBase.
250
    ///
251
    /// If successful, returns Ok(Some(Item)), or Ok(None) if exhausted.
252
    /// Returns Err on failure.
253
    ///
254
    /// Always consumes at least one line if possible; always ends on a
255
    /// line boundary if one exists.
256
110550
    fn item(&mut self) -> Result<Option<Item<'a, K>>> {
257
110550
        if self.remaining() == 0 {
258
5304
            return Ok(None);
259
105246
        }
260
105246
        let (kwd_str, args) = self.kwdline()?;
261
105044
        let object = self.object()?;
262
105030
        let split_args = RefCell::new(None);
263
105030
        let kwd = K::from_str(kwd_str);
264
105030
        Ok(Some(Item {
265
105030
            kwd,
266
105030
            kwd_str,
267
105030
            args,
268
105030
            split_args,
269
105030
            object,
270
105030
        }))
271
110550
    }
272
}
273

            
274
/// Return true iff 's' is a valid keyword or annotation.
///
/// (Only allow annotations if `anno_ok` is true.)
///
/// A keyword is non-empty, does not begin with `-`, and consists only
/// of ASCII letters, ASCII digits, and `-`.  When `anno_ok` is true, a
/// single leading `@` is permitted before the keyword.
//
// NOTE(review): with `anno_ok`, a bare "@" is accepted, because nothing
// is left to check once the '@' is removed -- confirm this is intended.
fn keyword_ok(s: &str, anno_ok: bool) -> bool {
    /// Helper: return true if this character can appear in keywords.
    fn kwd_char_ok(c: char) -> bool {
        matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-')
    }

    if s.is_empty() {
        return false;
    }
    // Drop at most one leading '@', and only where annotations are allowed.
    let s = if anno_ok {
        s.strip_prefix('@').unwrap_or(s)
    } else {
        s
    };
    if s.starts_with('-') {
        return false;
    }
    s.chars().all(kwd_char_ok)
}
295

            
296
/// Return true iff 's' is a valid keywords string for a BEGIN/END tag.
297
57930
pub(crate) fn tag_keywords_ok(s: &str) -> bool {
298
107551
    s.split(' ').all(|w| keyword_ok(w, false))
299
57930
}
300

            
301
/// When used as an Iterator, returns a sequence of `Result<Item>`.
302
impl<'a, K: Keyword> Iterator for NetDocReaderBase<'a, K> {
303
    type Item = Result<Item<'a, K>>;
304
110550
    fn next(&mut self) -> Option<Self::Item> {
305
110550
        self.item().transpose()
306
110550
    }
307
}
308

            
309
/// Helper: as base64::decode(), but allows newlines in the middle of the
310
/// encoded object.
311
28576
pub(crate) fn base64_decode_multiline(s: &str) -> std::result::Result<Vec<u8>, base64ct::Error> {
312
    // base64 module hates whitespace.
313
28576
    let mut s = s.to_string();
314
18736254
    s.retain(|ch| ch != '\n');
315
28576
    let v = Base64::decode_vec(&s)?;
316
28574
    Ok(v)
317
28576
}
318

            
319
impl<'a, K: Keyword> Item<'a, K> {
320
    /// Return the parsed keyword part of this item.
321
176708
    pub(crate) fn kwd(&self) -> K {
322
176708
        self.kwd
323
176708
    }
324
    /// Return the keyword part of this item, as a string.
325
2756
    pub(crate) fn kwd_str(&self) -> &'a str {
326
2756
        self.kwd_str
327
2756
    }
328
    /// Return true if the keyword for this item is in 'ks'.
329
77326
    pub(crate) fn has_kwd_in(&self, ks: &[K]) -> bool {
330
77326
        ks.contains(&self.kwd)
331
77326
    }
332
    /// Return the arguments of this item, as a single string.
333
23752
    pub(crate) fn args_as_str(&self) -> &'a str {
334
23752
        self.args
335
23752
    }
336
    /// Return the arguments of this item as a vector.
337
104524
    fn args_as_vec(&self) -> Ref<'_, Vec<&'a str>> {
338
        // We're using an interior mutability pattern here to lazily
339
        // construct the vector.
340
104524
        if self.split_args.borrow().is_none() {
341
48236
            self.split_args.replace(Some(self.args().collect()));
342
56288
        }
343
104524
        Ref::map(self.split_args.borrow(), |opt| match opt {
344
104524
            Some(v) => v,
345
            None => panic!(),
346
104524
        })
347
104524
    }
348
    /// Return an iterator over the arguments of this item.
349
156028
    pub(crate) fn args(&self) -> impl Iterator<Item = &'a str> + use<'a, K> {
350
416794
        self.args.split(is_sp).filter(|s| !s.is_empty())
351
156028
    }
352
    /// Return the nth argument of this item, if there is one.
353
104516
    pub(crate) fn arg(&self, idx: usize) -> Option<&'a str> {
354
104516
        self.args_as_vec().get(idx).copied()
355
104516
    }
356
    /// Return the nth argument of this item, or an error if it isn't there.
357
28154
    pub(crate) fn required_arg(&self, idx: usize) -> Result<&'a str> {
358
28154
        self.arg(idx)
359
28154
            .ok_or_else(|| EK::MissingArgument.at_pos(Pos::at(self.args)))
360
28154
    }
361
    /// Try to parse the nth argument (if it exists) into some type
362
    /// that supports FromStr.
363
    ///
364
    /// Returns Ok(None) if the argument doesn't exist.
365
70652
    pub(crate) fn parse_optional_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
366
70652
    where
367
70652
        Error: From<V::Err>,
368
    {
369
70652
        match self.arg(idx) {
370
6
            None => Ok(None),
371
70646
            Some(s) => match s.parse() {
372
70642
                Ok(r) => Ok(Some(r)),
373
4
                Err(e) => {
374
4
                    let e: Error = e.into();
375
4
                    Err(e.or_at_pos(Pos::at(s)))
376
                }
377
            },
378
        }
379
70652
    }
380
    /// Try to parse the nth argument (if it exists) into some type
381
    /// that supports FromStr.
382
    ///
383
    /// Return an error if the argument doesn't exist.
384
70644
    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<V>
385
70644
    where
386
70644
        Error: From<V::Err>,
387
    {
388
70644
        match self.parse_optional_arg(idx) {
389
70638
            Ok(Some(v)) => Ok(v),
390
2
            Ok(None) => Err(EK::MissingArgument.at_pos(self.arg_pos(idx))),
391
4
            Err(e) => Err(e),
392
        }
393
70644
    }
394
    /// Return the number of arguments for this Item
395
104942
    pub(crate) fn n_args(&self) -> usize {
396
104942
        self.args().count()
397
104942
    }
398
    /// Return true iff this Item has an associated object.
399
104256
    pub(crate) fn has_obj(&self) -> bool {
400
104256
        self.object.is_some()
401
104256
    }
402
    /// Return the tag of this item's associated object, if it has one.
403
190
    pub(crate) fn obj_tag(&self) -> Option<&'a str> {
404
190
        self.object.map(|o| o.tag)
405
190
    }
406
    /// Try to decode the base64 contents of this Item's associated object.
407
    ///
408
    /// On success, return the object's tag and decoded contents.
409
25466
    pub(crate) fn obj_raw(&self) -> Result<Option<(&'a str, Vec<u8>)>> {
410
25466
        match self.object {
411
2774
            None => Ok(None),
412
22692
            Some(obj) => {
413
22692
                let decoded = base64_decode_multiline(obj.data)
414
22692
                    .map_err(|_| EK::BadObjectBase64.at_pos(Pos::at(obj.data)))?;
415
22692
                Ok(Some((obj.tag, decoded)))
416
            }
417
        }
418
25466
    }
419
    /// Try to decode the base64 contents of this Item's associated object,
420
    /// and make sure that its tag matches 'want_tag'.
421
22694
    pub(crate) fn obj(&self, want_tag: &str) -> Result<Vec<u8>> {
422
22694
        match self.obj_raw()? {
423
2
            None => Err(EK::MissingObject
424
2
                .with_msg(self.kwd.to_str())
425
2
                .at_pos(self.end_pos())),
426
22692
            Some((tag, decoded)) => {
427
22692
                if tag != want_tag {
428
4
                    Err(EK::WrongObject.at_pos(Pos::at(tag)))
429
                } else {
430
22688
                    Ok(decoded)
431
                }
432
            }
433
        }
434
22694
    }
435
    /// Try to decode the base64 contents of this item's associated object
436
    /// as a given type that implements FromBytes.
437
15164
    pub(crate) fn parse_obj<V: FromBytes>(&self, want_tag: &str) -> Result<V> {
438
15164
        let bytes = self.obj(want_tag)?;
439
        // Unwrap may be safe because above `.obj()` should return an Error if
440
        // wanted tag was not present
441
        #[allow(clippy::unwrap_used)]
442
15164
        let p = Pos::at(self.object.unwrap().data);
443
15164
        V::from_vec(bytes, p).map_err(|e| e.at_pos(p))
444
15164
    }
445
    /// Return the position of this item.
446
    ///
447
    /// This position won't be useful unless it is later contextualized
448
    /// with the containing string.
449
4960
    pub(crate) fn pos(&self) -> Pos {
450
4960
        Pos::at(self.kwd_str)
451
4960
    }
452
    /// Return the position of this Item in a string.
453
    ///
454
    /// Returns None if this item doesn't actually belong to the string.
455
10318
    pub(crate) fn offset_in(&self, s: &str) -> Option<usize> {
456
10318
        crate::util::str::str_offset(s, self.kwd_str)
457
10318
    }
458
    /// Return the position of the n'th argument of this item.
459
    ///
460
    /// If this item does not have a n'th argument, return the
461
    /// position of the end of the final argument.
462
8
    pub(crate) fn arg_pos(&self, n: usize) -> Pos {
463
8
        let args = self.args_as_vec();
464
8
        if n < args.len() {
465
6
            Pos::at(args[n])
466
        } else {
467
2
            self.last_arg_end_pos()
468
        }
469
8
    }
470
    /// Return the position at the end of the last argument.  (This will
471
    /// point to a newline.)
472
1186
    fn last_arg_end_pos(&self) -> Pos {
473
1186
        Pos::at_end_of(self.args)
474
1186
    }
475
    /// Return the position of the end of this object. (This will point to a
476
    /// newline.)
477
1378
    pub(crate) fn end_pos(&self) -> Pos {
478
1378
        match self.object {
479
196
            Some(o) => Pos::at_end_of(o.endline),
480
1182
            None => self.last_arg_end_pos(),
481
        }
482
1378
    }
483
    /// If this item occurs within s, return the byte offset
484
    /// immediately after the end of this item.
485
450
    pub(crate) fn offset_after(&self, s: &str) -> Option<usize> {
486
450
        self.end_pos().offset_within(s).map(|nl_pos| nl_pos + 1)
487
450
    }
488

            
489
    /// Return the text of this item, if it originated within `str`,
490
    /// from the start of its keyword up to and including its final newline.
491
    #[allow(dead_code)] // unused when hsdesc not enabled.
492
726
    pub(crate) fn text_within<'b>(&self, s: &'b str) -> Option<&'b str> {
493
726
        let start = self.pos().offset_within(s)?;
494
726
        let end = self.end_pos().offset_within(s)?;
495
726
        s.get(start..=end)
496
726
    }
497
}
498

            
499
/// Represents an Item that might not be present, whose arguments we
500
/// want to inspect.  If the Item is there, this acts like a proxy to the
501
/// item; otherwise, it treats the item as having no arguments.
502
pub(crate) struct MaybeItem<'a, 'b, K: Keyword>(Option<&'a Item<'b, K>>);
503

            
504
// All methods here are as for Item.
505
impl<'a, 'b, K: Keyword> MaybeItem<'a, 'b, K> {
506
    /// Return the position of this item, if it has one.
507
6
    pub(crate) fn pos(&self) -> Pos {
508
6
        match self.0 {
509
6
            Some(item) => item.pos(),
510
            None => Pos::None,
511
        }
512
6
    }
513
    /// Construct a MaybeItem from an Option reference to an item.
514
12562
    pub(crate) fn from_option(opt: Option<&'a Item<'b, K>>) -> Self {
515
12562
        MaybeItem(opt)
516
12562
    }
517

            
518
    /// If this item is present, parse its argument at position `idx`.
519
    /// Treat the absence or malformedness of the argument as an error,
520
    /// but treat the absence of this item as acceptable.
521
2294
    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
522
2294
    where
523
2294
        Error: From<V::Err>,
524
    {
525
2294
        match self.0 {
526
2292
            Some(item) => match item.parse_arg(idx) {
527
2290
                Ok(v) => Ok(Some(v)),
528
2
                Err(e) => Err(e.or_at_pos(self.pos())),
529
            },
530
2
            None => Ok(None),
531
        }
532
2294
    }
533
    /// If this item is present, return its arguments as a single string.
534
4146
    pub(crate) fn args_as_str(&self) -> Option<&str> {
535
4146
        self.0.map(|item| item.args_as_str())
536
4146
    }
537
    /// If this item is present, parse all of its arguments as a
538
    /// single string.
539
6122
    pub(crate) fn parse_args_as_str<V: FromStr>(&self) -> Result<Option<V>>
540
6122
    where
541
6122
        Error: From<V::Err>,
542
    {
543
6122
        match self.0 {
544
2592
            Some(item) => match item.args_as_str().parse::<V>() {
545
2588
                Ok(v) => Ok(Some(v)),
546
4
                Err(e) => {
547
4
                    let e: Error = e.into();
548
4
                    Err(e.or_at_pos(self.pos()))
549
                }
550
            },
551
3530
            None => Ok(None),
552
        }
553
6122
    }
554
}
555

            
556
/// Extension trait for `Result<Item>` -- makes it convenient to implement
557
/// PauseAt predicates
558
pub(crate) trait ItemResult<K: Keyword> {
559
    /// Return true if this is an ok result with an annotation.
560
    fn is_ok_with_annotation(&self) -> bool;
561
    /// Return true if this is an ok result with a non-annotation.
562
    fn is_ok_with_non_annotation(&self) -> bool;
563
    /// Return true if this is an ok result with the keyword 'k'
564
20160
    fn is_ok_with_kwd(&self, k: K) -> bool {
565
20160
        self.is_ok_with_kwd_in(&[k])
566
20160
    }
567
    /// Return true if this is an ok result with a keyword in the slice 'ks'
568
    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool;
569
    /// Return true if this is an ok result with a keyword not in the slice 'ks'
570
    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool;
571
    /// Return true if this is an empty-line error.
572
    fn is_empty_line(&self) -> bool;
573
}
574

            
575
impl<'a, K: Keyword> ItemResult<K> for Result<Item<'a, K>> {
576
4614
    fn is_ok_with_annotation(&self) -> bool {
577
4614
        match self {
578
4600
            Ok(item) => item.kwd().is_annotation(),
579
14
            Err(_) => false,
580
        }
581
4614
    }
582
42
    fn is_ok_with_non_annotation(&self) -> bool {
583
42
        match self {
584
38
            Ok(item) => !item.kwd().is_annotation(),
585
4
            Err(_) => false,
586
        }
587
42
    }
588
70492
    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool {
589
70492
        match self {
590
70424
            Ok(item) => item.has_kwd_in(ks),
591
68
            Err(_) => false,
592
        }
593
70492
    }
594
7012
    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool {
595
7012
        match self {
596
6902
            Ok(item) => !item.has_kwd_in(ks),
597
110
            Err(_) => false,
598
        }
599
7012
    }
600
4608
    fn is_empty_line(&self) -> bool {
601
12
        matches!(
602
12
            self,
603
12
            Err(e) if e.netdoc_error_kind() == crate::err::NetdocErrorKind::EmptyLine
604
        )
605
4608
    }
606
}
607

            
608
/// A peekable cursor into a string that returns Items one by one.
609
///
610
/// This is an [`Iterator`], yielding [`Item`]s.
611
#[derive(Debug)]
612
pub(crate) struct NetDocReader<'a, K: Keyword> {
613
    // TODO: I wish there were some way around having this string
614
    // reference, since we already need one inside NetDocReaderBase.
615
    /// The underlying string being parsed.
616
    s: &'a str,
617
    /// A stream of tokens being parsed by this NetDocReader.
618
    tokens: Peekable<NetDocReaderBase<'a, K>>,
619
}
620

            
621
impl<'a, K: Keyword> NetDocReader<'a, K> {
622
    /// Construct a new NetDocReader to read tokens from `s`.
623
5448
    pub(crate) fn new(s: &'a str) -> Result<Self> {
624
        Ok(NetDocReader {
625
5448
            s,
626
5448
            tokens: NetDocReaderBase::new(s)?.peekable(),
627
        })
628
5448
    }
629
    /// Return a reference to the string used for this NetDocReader.
630
5148
    pub(crate) fn str(&self) -> &'a str {
631
5148
        self.s
632
5148
    }
633
    /// Return a wrapper around the peekable iterator in this
634
    /// NetDocReader that reads tokens until it reaches an element where
635
    /// 'f' is true.
636
11618
    pub(crate) fn pause_at<'f, 'r, F>(
637
11618
        &mut self,
638
11618
        mut f: F,
639
11618
    ) -> itertools::PeekingTakeWhile<
640
11618
        '_,
641
11618
        Self,
642
11618
        impl FnMut(&Result<Item<'a, K>>) -> bool + 'f + use<'a, 'f, F, K>,
643
11618
    >
644
11618
    where
645
11618
        'f: 'r,
646
11618
        F: FnMut(&Result<Item<'a, K>>) -> bool + 'f,
647
11618
        K: 'f,
648
    {
649
80926
        self.peeking_take_while(move |i| !f(i))
650
11618
    }
651

            
652
    /// Return true if there are no more items in this NetDocReader.
653
    // The implementation sadly needs to mutate the inner state, even if it's not *semantically*
654
    // mutated..  We don't want inner mutability just to placate clippy for an internal API.
655
    #[allow(clippy::wrong_self_convention)]
656
    #[allow(dead_code)] // TODO perhaps we should remove this ?
657
    pub(crate) fn is_exhausted(&mut self) -> bool {
658
        self.peek().is_none()
659
    }
660

            
661
    /// Give an error if there are remaining tokens in this NetDocReader.
662
2404
    pub(crate) fn should_be_exhausted(&mut self) -> Result<()> {
663
2404
        match self.peek() {
664
2402
            None => Ok(()),
665
2
            Some(Ok(t)) => Err(EK::UnexpectedToken
666
2
                .with_msg(t.kwd().to_str())
667
2
                .at_pos(t.pos())),
668
            Some(Err(e)) => Err(e.clone()),
669
        }
670
2404
    }
671

            
672
    /// Give an error if there are remaining tokens in this NetDocReader.
673
    ///
674
    /// Like [`should_be_exhausted`](Self::should_be_exhausted),
675
    /// but permit empty lines at the end of the document.
676
2274
    pub(crate) fn should_be_exhausted_but_for_empty_lines(&mut self) -> Result<()> {
677
        use crate::err::NetdocErrorKind as K;
678
2276
        while let Some(Err(e)) = self.peek() {
679
2
            if e.netdoc_error_kind() == K::EmptyLine {
680
2
                let _ignore = self.next();
681
2
            } else {
682
                break;
683
            }
684
        }
685
2274
        self.should_be_exhausted()
686
2274
    }
687

            
688
    /// Return the position from which the underlying reader is about to take
689
    /// the next token.  Use to make sure that the reader is progressing.
690
2530
    pub(crate) fn pos(&mut self) -> Pos {
691
2530
        match self.tokens.peek() {
692
2524
            Some(Ok(tok)) => tok.pos(),
693
2
            Some(Err(e)) => e.pos(),
694
4
            None => Pos::at_end_of(self.s),
695
        }
696
2530
    }
697
}
698

            
699
impl<'a, K: Keyword> Iterator for NetDocReader<'a, K> {
700
    type Item = Result<Item<'a, K>>;
701
107138
    fn next(&mut self) -> Option<Self::Item> {
702
107138
        self.tokens.next()
703
107138
    }
704
}
705

            
706
/// Peeking is delegated to the wrapped peekable token stream.
impl<'a, K: Keyword> PeekableIterator for NetDocReader<'a, K> {
    fn peek(&mut self) -> Option<&Self::Item> {
        self.tokens.peek()
    }
}
711

            
712
/// Lets `itertools` adaptors such as `peeking_take_while` drive this reader.
impl<'a, K: Keyword> itertools::PeekingNext for NetDocReader<'a, K> {
    fn peeking_next<F>(&mut self, f: F) -> Option<Self::Item>
    where
        F: FnOnce(&Self::Item) -> bool,
    {
        // Consume the next element only if `f` accepts it; return
        // None if there is no next element or `f` rejects it.
        if f(self.peek()?) { self.next() } else { None }
    }
}
720

            
721
/// Check additional UTF-8 rules that the netdoc metaformat imposes on
722
/// our documents.
723
//
724
// NOTE: We might decide in the future to loosen our rules here
725
// for parsers that handle concatenated documents:
726
// we might want to reject only those documents that contain NULs.
727
// But with luck that will never be necessary.
728
5468
fn validate_utf_8_rules(s: &str) -> Result<&str> {
729
    // No BOM, or mangled BOM, is allowed.
730
5468
    let first_char = s.chars().next();
731
5468
    if [Some('\u{feff}'), Some('\u{fffe}')].contains(&first_char) {
732
6
        return Err(EK::BomMarkerFound.at_pos(Pos::at(s)));
733
5462
    }
734
    // No NUL bytes are allowed.
735
5462
    if let Some(nul_pos) = memchr::memchr(0, s.as_bytes()) {
736
10
        return Err(EK::NulFound.at_pos(Pos::from_byte(nul_pos)));
737
5452
    }
738
5452
    Ok(s)
739
5468
}
740

            
741
#[cfg(test)]
742
mod test {
743
    // @@ begin test lint list maintained by maint/add_warning @@
744
    #![allow(clippy::bool_assert_comparison)]
745
    #![allow(clippy::clone_on_copy)]
746
    #![allow(clippy::dbg_macro)]
747
    #![allow(clippy::mixed_attributes_style)]
748
    #![allow(clippy::print_stderr)]
749
    #![allow(clippy::print_stdout)]
750
    #![allow(clippy::single_char_pattern)]
751
    #![allow(clippy::unwrap_used)]
752
    #![allow(clippy::unchecked_time_subtraction)]
753
    #![allow(clippy::useless_vec)]
754
    #![allow(clippy::needless_pass_by_value)]
755
    #![allow(clippy::string_slice)] // See arti#2571
756
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->
757
    #![allow(clippy::cognitive_complexity)]
758
    use super::*;
759
    use crate::parse::macros::test::Fruit;
760
    use crate::{NetdocErrorKind as EK, Pos, Result};
761

            
762
    #[test]
    fn read_simple() {
        use Fruit::*;

        // A well-formed document of five items; `cherry` carries an object.
        let s = "\
@tasty very much so
opt apple 77
banana 60
cherry 6
-----BEGIN CHERRY SYNOPSIS-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END CHERRY SYNOPSIS-----
plum hello there
";
        let mut reader: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();

        assert_eq!(reader.str(), s);
        // Nothing has been consumed yet, so the reader is not exhausted.
        assert!(reader.should_be_exhausted().is_err());

        let items: Result<Vec<_>> = reader.by_ref().collect();
        assert!(reader.should_be_exhausted().is_ok());

        let items = items.unwrap();
        assert_eq!(items.len(), 5);

        // Item 0: keyword and argument accessors on an item without an object.
        let tasty = &items[0];
        assert_eq!(tasty.kwd(), ANN_TASTY);
        assert_eq!(tasty.n_args(), 3);
        assert_eq!(tasty.args_as_str(), "very much so");
        assert_eq!(tasty.arg(1), Some("much"));
        let all_args: Vec<_> = tasty.args().collect();
        assert_eq!(all_args, vec!["very", "much", "so"]);
        // "very" is not a number, and there is no argument 10.
        assert!(tasty.parse_arg::<usize>(0).is_err());
        assert!(tasty.parse_arg::<usize>(10).is_err());
        assert!(!tasty.has_obj());
        assert_eq!(tasty.obj_tag(), None);

        // Item 2: position reporting.
        let banana = &items[2];
        assert_eq!(banana.pos().within(s), Pos::from_line(3, 1));
        assert_eq!(banana.arg_pos(0).within(s), Pos::from_line(3, 8));
        assert_eq!(banana.last_arg_end_pos().within(s), Pos::from_line(3, 10));
        assert_eq!(banana.end_pos().within(s), Pos::from_line(3, 10));

        // Item 3: argument parsing and object access.
        let cherry = &items[3];
        assert_eq!(cherry.kwd(), STONEFRUIT);
        assert_eq!(cherry.kwd_str(), "cherry"); // not cherry/plum!
        assert_eq!(cherry.n_args(), 1);
        assert_eq!(cherry.required_arg(0), Ok("6"));
        assert_eq!(cherry.parse_arg::<usize>(0), Ok(6));
        assert_eq!(cherry.parse_optional_arg::<usize>(0), Ok(Some(6)));
        assert_eq!(cherry.parse_optional_arg::<usize>(3), Ok(None));
        assert!(cherry.has_obj());
        assert_eq!(cherry.obj_tag(), Some("CHERRY SYNOPSIS"));
        assert_eq!(
            &cherry.obj("CHERRY SYNOPSIS").unwrap()[..],
            "🍒🍒🍒🍒🍒🍒".as_bytes()
        );
        // Asking for the object under the wrong tag is an error.
        assert!(cherry.obj("PLUOT SYNOPSIS").is_err());
        // this "end-pos" value is questionable!
        assert_eq!(cherry.end_pos().within(s), Pos::from_line(7, 30));
    }
821

            
822
    #[test]
    fn test_badtoks() {
        use Fruit::*;

        // A document mixing valid items with a variety of malformed ones.
        let s = "\
-foobar 9090
apple 3.14159
$hello
unrecognized 127.0.0.1 foo
plum
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END SOMETHING ELSE-----
orange
orange
-----BEGIN WHATEVER-----
not! base64!
-----END WHATEVER-----
guava paste
opt @annotation
orange
-----BEGIN LOBSTER
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END SOMETHING ELSE-----
orange
-----BEGIN !!!!!!-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END !!!!!!-----
cherry
-----BEGIN CHERRY SYNOPSIS-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END CHERRY SYNOPSIS

truncated line";

        let reader: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
        let toks: Vec<_> = reader.collect();

        // Assert that token `idx` is an error of kind `kind` located at `pos`.
        let expect_err = |idx: usize, kind: EK, pos: Pos| {
            assert!(toks[idx].is_err());
            assert_eq!(toks[idx].as_ref().err().unwrap(), &kind.at_pos(pos));
        };

        expect_err(0, EK::BadKeyword, Pos::from_line(1, 1));

        // A good item: check the `ItemResult`-style predicates on it.
        assert!(toks[1].is_ok());
        assert!(toks[1].is_ok_with_non_annotation());
        assert!(!toks[1].is_ok_with_annotation());
        assert!(toks[1].is_ok_with_kwd_in(&[APPLE, ORANGE]));
        assert!(toks[1].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
        let apple = toks[1].as_ref().unwrap();
        assert_eq!(apple.kwd(), APPLE);
        assert_eq!(apple.arg(0), Some("3.14159"));

        // A bad item: every one of those predicates must be false.
        assert!(!toks[2].is_ok_with_non_annotation());
        assert!(!toks[2].is_ok_with_annotation());
        assert!(!toks[2].is_ok_with_kwd_in(&[APPLE, ORANGE]));
        assert!(!toks[2].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
        expect_err(2, EK::BadKeyword, Pos::from_line(3, 1));

        assert!(toks[3].is_ok());
        let unrecognized = toks[3].as_ref().unwrap();
        assert_eq!(unrecognized.kwd(), UNRECOGNIZED);
        assert_eq!(unrecognized.arg(1), Some("foo"));

        expect_err(4, EK::BadObjectMismatchedTag, Pos::from_line(8, 1));

        assert!(toks[5].is_ok());
        let orange = toks[5].as_ref().unwrap();
        assert_eq!(orange.kwd(), ORANGE);
        assert_eq!(orange.args_as_str(), "");

        // This blob counts as two errors: a bad base64 blob, and
        // then an end line.
        expect_err(6, EK::BadObjectBase64, Pos::from_line(12, 1));
        expect_err(7, EK::BadKeyword, Pos::from_line(13, 1));

        assert!(toks[8].is_ok());
        let guava = toks[8].as_ref().unwrap();
        assert_eq!(guava.kwd(), GUAVA);

        // this is an error because you can't use opt with annotations.
        expect_err(9, EK::BadKeyword, Pos::from_line(15, 1));

        // this looks like a few errors.
        expect_err(10, EK::BadObjectBeginTag, Pos::from_line(17, 1));
        expect_err(11, EK::BadKeyword, Pos::from_line(18, 1));
        expect_err(12, EK::BadKeyword, Pos::from_line(19, 1));

        // so does this.
        expect_err(13, EK::BadObjectBeginTag, Pos::from_line(21, 1));
        expect_err(14, EK::BadKeyword, Pos::from_line(22, 1));
        expect_err(15, EK::BadKeyword, Pos::from_line(23, 1));

        // not this.
        expect_err(16, EK::BadObjectEndTag, Pos::from_line(27, 1));

        expect_err(17, EK::EmptyLine, Pos::from_line(28, 1));

        expect_err(18, EK::TruncatedLine, Pos::from_line(29, 15));
    }
979

            
980
    #[test]
    fn test_leading_space_forbidden() {
        // Items with a leading space must not be accepted: the spec forbids
        // them, and they can provide a vector for inflating the size of
        // downloaded hsdescs (see prop360).

        // Case 1: a simple item with spaces at the front.
        let spaced = "    guava space\n";
        let reader: NetDocReader<'_, Fruit> = NetDocReader::new(spaced).unwrap();
        let results: Vec<_> = reader.collect();

        // No space is allowed at the start of a line.
        assert_eq!(
            results[0].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
        );

        // Case 2: an item with an object, inserting a space at the start of
        // each line in turn.
        let s = "cherry
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END WHATEVER-----
";
        let orig_lines: Vec<&str> = s.split_terminator('\n').collect();
        assert_eq!(orig_lines.len(), 4);
        // Entry `i` is the error kind expected when line `i` gets the space.
        let expected_kinds = [
            EK::BadKeyword,
            EK::BadKeyword,
            EK::BadObjectBase64,
            EK::BadObjectBase64,
        ];
        for (indented, expected) in expected_kinds.iter().enumerate() {
            // Rebuild the document, newline-terminating every line and
            // prefixing only the chosen line with a single space.
            let doc: String = orig_lines
                .iter()
                .enumerate()
                .map(|(n, line)| {
                    let prefix = if n == indented { " " } else { "" };
                    format!("{}{}\n", prefix, line)
                })
                .collect();
            let reader: NetDocReader<'_, Fruit> = NetDocReader::new(&doc).unwrap();
            let outcome: Result<Vec<_>> = reader.collect();
            assert_eq!(&outcome.unwrap_err().netdoc_error_kind(), expected);
        }
    }
    #[test]
    fn test_validate_strings() {
        // Acceptable inputs come back unchanged.  A couple of cases are
        // enough here, since this function is called before parsing any
        // string.
        for good in ["", "hello world"] {
            assert_eq!(validate_utf_8_rules(good), Ok(good));
        }

        // A BOM (or byte-swapped BOM) as the first character is rejected,
        // with the error located at offset 0.
        for bad in ["\u{feff}", "\u{feff}hello world", "\u{fffe}hello world"] {
            let err = validate_utf_8_rules(bad).unwrap_err();
            assert_eq!(err.netdoc_error_kind(), EK::BomMarkerFound);
            assert_eq!(err.pos().offset_within(bad), Some(0));
        }

        // A NUL anywhere is rejected, and the reported offset must point at
        // a NUL byte.
        let with_nul = [
            "\0hello world",
            "\0",
            "\0\0\0",
            "hello\0world",
            "hello world\0",
        ];
        for bad in with_nul {
            let err = validate_utf_8_rules(bad).unwrap_err();
            assert_eq!(err.netdoc_error_kind(), EK::NulFound);
            let offset = err.pos().offset_within(bad).unwrap();
            assert_eq!(bad.as_bytes()[offset], 0);
        }
    }
    /// Test helper: parse `s` as a `Fruit` document and return its first item.
    ///
    /// Panics if the reader cannot be constructed, if the document yields no
    /// items, or if the first item is an error.
    fn single_fruit(s: &str) -> Item<'_, Fruit> {
        let mut reader = NetDocReader::<Fruit>::new(s).unwrap();
        let first = reader.next().unwrap();
        first.unwrap()
    }
    #[test]
    fn end_of_item() {
        // Item without an object: the end position is the newline that
        // terminates the line, after the trailing spaces.
        let s = "guava friends 123   \n";
        let item = single_fruit(s);
        let newline = s.find('\n').unwrap();
        assert_eq!(item.end_pos().within(s), Pos::from_byte(newline).within(s));

        // Item with an object: the end position is the final newline, after
        // the object's END line.
        let s = "cherry
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END WHATEVER-----\n";
        let item = single_fruit(s);
        dbg!(&item);
        let last_newline = s.rfind('\n').unwrap();
        assert_eq!(
            item.end_pos().within(s),
            Pos::from_byte(last_newline).within(s)
        );
    }
}