Skip to content

Commit bdca6f3

Browse files
committed
refactor: extract bencode tokenizer
Split parser logic into two types: - Tokenizer: It returns bencoded tokens. - Generator: It iterates over bencoded tokens to generate the JSON.
1 parent a2eb63c commit bdca6f3

File tree

3 files changed

+109
-29
lines changed

3 files changed

+109
-29
lines changed

src/parsers/integer.rs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,13 @@ enum StateExpecting {
3131
///
3232
/// Will panic if we reach the end of the input without completing the integer
3333
/// (without reaching the end of the integer `e`).
34-
pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) -> Result<(), Error> {
34+
pub fn parse<R: Read, W: Writer>(
35+
reader: &mut ByteReader<R>,
36+
writer: &mut W,
37+
) -> Result<Vec<u8>, Error> {
3538
let mut state = StateExpecting::Start;
3639
let mut first_digit_is_zero = false;
40+
let mut value = vec![];
3741

3842
loop {
3943
let byte = next_byte(reader, writer)?;
@@ -48,10 +52,12 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
4852
StateExpecting::DigitOrSign => {
4953
if char == '-' {
5054
writer.write_byte(byte)?;
55+
value.push(byte);
5156

5257
StateExpecting::DigitAfterSign
5358
} else if char.is_ascii_digit() {
5459
writer.write_byte(byte)?;
60+
value.push(byte);
5561

5662
if char == '0' {
5763
first_digit_is_zero = true;
@@ -76,6 +82,7 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
7682
StateExpecting::DigitAfterSign => {
7783
if char.is_ascii_digit() {
7884
writer.write_byte(byte)?;
85+
value.push(byte);
7986

8087
if char == '0' {
8188
first_digit_is_zero = true;
@@ -100,6 +107,7 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
100107
StateExpecting::DigitOrEnd => {
101108
if char.is_ascii_digit() {
102109
writer.write_byte(byte)?;
110+
value.push(byte);
103111

104112
if char == '0' && first_digit_is_zero {
105113
return Err(Error::LeadingZerosInIntegersNotAllowed(
@@ -118,7 +126,7 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
118126

119127
StateExpecting::DigitOrEnd
120128
} else if byte == BENCODE_END_INTEGER {
121-
return Ok(());
129+
return Ok(value);
122130
} else {
123131
return Err(Error::UnexpectedByteParsingInteger(
124132
ReadContext {
@@ -185,12 +193,12 @@ mod tests {
185193
let mut output = String::new();
186194

187195
match parse_bencode(input_buffer, &mut output) {
188-
Ok(()) => Ok(output),
196+
Ok(_value) => Ok(output),
189197
Err(err) => Err(err),
190198
}
191199
}
192200

193-
fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<(), Error> {
201+
fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<Vec<u8>, Error> {
194202
let mut reader = ByteReader::new(input_buffer);
195203

196204
let mut writer = StringWriter::new(output);

src/parsers/mod.rs

Lines changed: 86 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ pub mod integer;
88
pub mod stack;
99
pub mod string;
1010

11+
use core::str;
1112
use std::{
1213
fmt::Write as FmtWrite,
1314
io::{self, Read, Write as IoWrite},
@@ -36,6 +37,16 @@ pub enum BencodeType {
3637
Dict,
3738
}
3839

40+
#[derive(Debug, PartialEq)]
41+
pub enum BencodeToken {
42+
Integer(Vec<u8>),
43+
String(Vec<u8>),
44+
BeginList,
45+
BeginDict,
46+
EndListOrDict,
47+
LineBreak,
48+
}
49+
3950
pub struct BencodeParser<R: Read> {
4051
byte_reader: ByteReader<R>,
4152
num_processed_tokens: u64,
@@ -104,35 +115,39 @@ impl<R: Read> BencodeParser<R> {
104115
/// - It can't read from the input or write to the output.
105116
/// - The input is invalid Bencode.
106117
fn parse<W: Writer>(&mut self, writer: &mut W) -> Result<(), error::Error> {
107-
while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, writer)? {
108-
match peeked_byte {
118+
let capture_output = Vec::new();
119+
let mut null_writer = ByteWriter::new(capture_output);
120+
121+
while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, &null_writer)? {
122+
let token: BencodeToken = match peeked_byte {
109123
BENCODE_BEGIN_INTEGER => {
110-
self.begin_bencoded_value(BencodeType::Integer, writer)?;
111-
integer::parse(&mut self.byte_reader, writer)?;
124+
let value = integer::parse(&mut self.byte_reader, &mut null_writer)?;
125+
BencodeToken::Integer(value)
112126
}
113127
b'0'..=b'9' => {
114-
self.begin_bencoded_value(BencodeType::String, writer)?;
115-
string::parse(&mut self.byte_reader, writer)?;
128+
let value = string::parse(&mut self.byte_reader, &mut null_writer)?;
129+
BencodeToken::String(value)
116130
}
117131
BENCODE_BEGIN_LIST => {
118-
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
119-
self.begin_bencoded_value(BencodeType::List, writer)?;
120-
writer.write_byte(Self::JSON_ARRAY_BEGIN)?;
121-
self.stack.push(State::ExpectingFirstListItemOrEnd);
132+
let _byte =
133+
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
134+
BencodeToken::BeginList
122135
}
123136
BENCODE_BEGIN_DICT => {
124-
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
125-
self.begin_bencoded_value(BencodeType::Dict, writer)?;
126-
writer.write_byte(Self::JSON_OBJ_BEGIN)?;
127-
self.stack.push(State::ExpectingFirstDictFieldOrEnd);
137+
let _byte =
138+
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
139+
BencodeToken::BeginDict
128140
}
129141
BENCODE_END_LIST_OR_DICT => {
130-
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
131-
self.end_list_or_dict(writer)?;
142+
let _byte =
143+
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
144+
BencodeToken::EndListOrDict
132145
}
133146
b'\n' => {
134147
// Ignore line breaks at the beginning, the end, or between values
135-
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
148+
let _byte =
149+
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
150+
BencodeToken::LineBreak
136151
}
137152
_ => {
138153
return Err(error::Error::UnrecognizedFirstBencodeValueByte(
@@ -148,6 +163,60 @@ impl<R: Read> BencodeParser<R> {
148163
},
149164
));
150165
}
166+
};
167+
168+
/* TODO:
169+
170+
- Extract tokenizer (without implementing the Iterator trait).
171+
- Remove writer from tokenizer.
172+
- Implement trait Iterator for tokenizer.
173+
- Rename this parser to generator.
174+
175+
*/
176+
177+
match token {
178+
BencodeToken::Integer(integer_bytes) => {
179+
self.begin_bencoded_value(BencodeType::Integer, writer)?;
180+
// todo: add `write_bytes` to writer.
181+
for bytes in integer_bytes {
182+
writer.write_byte(bytes)?;
183+
}
184+
}
185+
BencodeToken::String(string_bytes) => {
186+
self.begin_bencoded_value(BencodeType::String, writer)?;
187+
188+
let html_tag_style_string = match str::from_utf8(&string_bytes) {
189+
Ok(string) => {
190+
// String only contains valid UTF-8 chars -> print it as is
191+
&format!("<string>{}</string>", string.to_owned())
192+
}
193+
Err(_) => {
194+
// String contains non valid UTF-8 chars -> print it as hex bytes
195+
&format!("<hex>{}</hex>", hex::encode(string_bytes))
196+
}
197+
};
198+
199+
writer.write_str(
200+
&serde_json::to_string(&html_tag_style_string)
201+
.expect("Failed to serialize to JSON. This should not happen because non UTF-8 bencoded string are serialized as hex bytes"),
202+
)?;
203+
}
204+
BencodeToken::BeginList => {
205+
self.begin_bencoded_value(BencodeType::List, writer)?;
206+
writer.write_byte(Self::JSON_ARRAY_BEGIN)?;
207+
self.stack.push(State::ExpectingFirstListItemOrEnd);
208+
}
209+
BencodeToken::BeginDict => {
210+
self.begin_bencoded_value(BencodeType::Dict, writer)?;
211+
writer.write_byte(Self::JSON_OBJ_BEGIN)?;
212+
self.stack.push(State::ExpectingFirstDictFieldOrEnd);
213+
}
214+
BencodeToken::EndListOrDict => {
215+
self.end_list_or_dict(writer)?;
216+
}
217+
BencodeToken::LineBreak => {
218+
// Ignore line breaks at the beginning, the end, or between values
219+
}
151220
}
152221

153222
self.num_processed_tokens += 1;

src/parsers/string.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ use super::error::{Error, ReadContext, WriteContext};
2525
/// # Panics
2626
///
2727
/// Will panic if we reach the end of the input without completing the string.
28-
pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) -> Result<(), Error> {
28+
pub fn parse<R: Read, W: Writer>(
29+
reader: &mut ByteReader<R>,
30+
writer: &mut W,
31+
) -> Result<Vec<u8>, Error> {
2932
let mut string_parser = StringParser::default();
3033
string_parser.parse(reader, writer)
3134
}
@@ -46,20 +49,20 @@ impl StringParser {
4649
&mut self,
4750
reader: &mut ByteReader<R>,
4851
writer: &mut W,
49-
) -> Result<(), Error> {
52+
) -> Result<Vec<u8>, Error> {
5053
let mut length = Length::default();
5154

5255
length.parse(reader, writer)?;
5356

5457
let mut value = Value::new(length.number);
5558

56-
value.parse(reader, writer)?;
59+
let value_bytes = value.parse(reader, writer)?;
5760

5861
self.parsed_value = value.utf8();
5962

6063
writer.write_str(&self.json())?;
6164

62-
Ok(())
65+
Ok(value_bytes)
6366
}
6467

6568
/// It returns the final parsed value as string.
@@ -202,12 +205,12 @@ impl Value {
202205
&mut self,
203206
reader: &mut ByteReader<R>,
204207
writer: &W,
205-
) -> Result<(), Error> {
208+
) -> Result<Vec<u8>, Error> {
206209
for _i in 1..=self.length {
207210
self.add_byte(Self::next_byte(reader, writer)?);
208211
}
209212

210-
Ok(())
213+
Ok(self.bytes.clone())
211214
}
212215

213216
/// It reads the next byte from the input.
@@ -282,12 +285,12 @@ mod tests {
282285
let mut output = String::new();
283286

284287
match parse_bencode(input_buffer, &mut output) {
285-
Ok(()) => Ok(output),
288+
Ok(_string_value_bytes) => Ok(output),
286289
Err(err) => Err(err),
287290
}
288291
}
289292

290-
fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<(), Error> {
293+
fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<Vec<u8>, Error> {
291294
let mut reader = ByteReader::new(input_buffer);
292295

293296
let mut writer = StringWriter::new(output);

0 commit comments

Comments
 (0)