diff --git a/README.md b/README.md
index 2cd943a..5285fb7 100644
--- a/README.md
+++ b/README.md
@@ -65,12 +65,12 @@ Error: Unexpected end of input parsing integer; read context: input pos 3, lates
```console
printf "3:ab" | cargo run
-Error: Unexpected end of input parsing string value; read context: input pos 4, latest input bytes dump: [51, 58, 97, 98] (UTF-8 string: `3:ab`); write context: output pos 0, latest output bytes dump: [] (UTF-8 string: ``)
+Error: Unexpected end of input parsing string value; read context: input pos 4, latest input bytes dump: [51, 58, 97, 98] (UTF-8 string: `3:ab`)
```
```console
echo "i00e" | cargo run
-Error: Leading zeros in integers are not allowed, for example b'i00e'; read context: byte `48` (char: `0`), input pos 3, latest input bytes dump: [105, 48, 48] (UTF-8 string: `i00`); write context: byte `48` (char: `0`), output pos 2, latest output bytes dump: [48, 48] (UTF-8 string: `00`)
+Error: Leading zeros in integers are not allowed, for example b'i00e'; read context: byte `48` (char: `0`), input pos 3, latest input bytes dump: [105, 48, 48] (UTF-8 string: `i00`)
```
Generating pretty JSON with [jq][jq]:
@@ -111,36 +111,10 @@ cargo add bencode2json
There two ways of using the library:
-- With high-level parser wrappers.
-- With the low-level parsers.
+- With high-level wrappers.
+- With the low-level generators.
-Example using the high-level parser wrappers:
-
-```rust
-use bencode2json::{try_bencode_to_json};
-
-let result = try_bencode_to_json(b"d4:spam4:eggse").unwrap();
-
-assert_eq!(result, r#"{"spam":"eggsstring>"}"#);
-```
-
-Example using the low-level parser:
-
-```rust
-use bencode2json::parsers::{BencodeParser};
-
-let mut output = String::new();
-
-let mut parser = BencodeParser::new(&b"4:spam"[..]);
-
-parser
- .write_str(&mut output)
- .expect("Bencode to JSON conversion failed");
-
-println!("{output}"); // It prints the JSON string: "spam"
-```
-
-More [examples](./examples/).
+See [examples](./examples/).
## Test
@@ -167,21 +141,19 @@ cargo cov
## Performance
In terms of memory usage this implementation consumes at least the size of the
-biggest bencoded string. The string parser keeps all the string bytes in memory until
-it parses the whole string, in order to convert it to UTF-8, when it's possible.
+biggest bencoded integer or string. The string and integer parsers keep all the bytes in memory until
+they parse the whole value.
The library also wraps the input and output streams in a [BufReader](https://doc.rust-lang.org/std/io/struct.BufReader.html)
and [BufWriter](https://doc.rust-lang.org/std/io/struct.BufWriter.html) because it can be excessively inefficient to work directly with something that implements [Read](https://doc.rust-lang.org/std/io/trait.Read.html) or [Write](https://doc.rust-lang.org/std/io/trait.Write.html).
## TODO
-- [ ] More examples of using the library.
- [ ] Counter for number of items in a list for debugging and errors.
- [ ] Fuzz testing: Generate random valid bencoded values.
- [ ] Install tracing crate. Add verbose mode that enables debugging.
- [ ] Option to check if the final JSON it's valid at the end of the process.
- [ ] Benchmarking for this implementation and the original C implementation.
-- [ ] Optimize string parser. We can stop trying to convert the string to UTF-8 when we find a non valid UTF-8 char.
## Alternatives
diff --git a/examples/parser_file_in_file_out.rs b/examples/parser_file_in_file_out.rs
index 732f9b8..06acb13 100644
--- a/examples/parser_file_in_file_out.rs
+++ b/examples/parser_file_in_file_out.rs
@@ -10,7 +10,7 @@ use std::{
io::{Read, Write},
};
-use bencode2json::parsers::BencodeParser;
+use bencode2json::generators::json::Generator;
use clap::{Arg, Command};
fn main() {
@@ -61,7 +61,7 @@ fn main() {
std::process::exit(1);
};
- if let Err(e) = BencodeParser::new(input).write_bytes(&mut output) {
+ if let Err(e) = Generator::new(input).write_bytes(&mut output) {
eprintln!("Error: {e}");
std::process::exit(1);
}
diff --git a/examples/parser_stdin_stdout.rs b/examples/parser_stdin_stdout.rs
index 4f8e162..01456cb 100644
--- a/examples/parser_stdin_stdout.rs
+++ b/examples/parser_stdin_stdout.rs
@@ -7,13 +7,13 @@
//! It prints "spam".
use std::io;
-use bencode2json::parsers::BencodeParser;
+use bencode2json::generators::json::Generator;
fn main() {
let input = Box::new(io::stdin());
let mut output = Box::new(io::stdout());
- if let Err(e) = BencodeParser::new(input).write_bytes(&mut output) {
+ if let Err(e) = Generator::new(input).write_bytes(&mut output) {
eprintln!("Error: {e}");
std::process::exit(1);
}
diff --git a/examples/parser_string_in_string_out.rs b/examples/parser_string_in_string_out.rs
index 638304e..6c7bf7c 100644
--- a/examples/parser_string_in_string_out.rs
+++ b/examples/parser_string_in_string_out.rs
@@ -5,13 +5,13 @@
//! ```
//!
//! It prints "spam".
-use bencode2json::parsers::BencodeParser;
+use bencode2json::generators::json::Generator;
fn main() {
let input = "4:spam".to_string();
let mut output = String::new();
- if let Err(e) = BencodeParser::new(input.as_bytes()).write_str(&mut output) {
+ if let Err(e) = Generator::new(input.as_bytes()).write_str(&mut output) {
eprintln!("Error: {e}");
std::process::exit(1);
}
diff --git a/examples/parser_string_in_vec_out.rs b/examples/parser_string_in_vec_out.rs
index 56c073b..3d89dfd 100644
--- a/examples/parser_string_in_vec_out.rs
+++ b/examples/parser_string_in_vec_out.rs
@@ -5,13 +5,13 @@
//! ```
//!
//! It prints "spam".
-use bencode2json::parsers::BencodeParser;
+use bencode2json::generators::json::Generator;
fn main() {
let input = "4:spam".to_string();
let mut output = Vec::new();
- if let Err(e) = BencodeParser::new(input.as_bytes()).write_bytes(&mut output) {
+ if let Err(e) = Generator::new(input.as_bytes()).write_bytes(&mut output) {
eprintln!("Error: {e}");
std::process::exit(1);
}
diff --git a/examples/parser_vec_in_string_out.rs b/examples/parser_vec_in_string_out.rs
index fc27d07..1388f51 100644
--- a/examples/parser_vec_in_string_out.rs
+++ b/examples/parser_vec_in_string_out.rs
@@ -5,13 +5,13 @@
//! ```
//!
//! It prints "spam".
-use bencode2json::parsers::BencodeParser;
+use bencode2json::generators::json::Generator;
fn main() {
let input = b"4:spam".to_vec();
let mut output = String::new();
- if let Err(e) = BencodeParser::new(&input[..]).write_str(&mut output) {
+ if let Err(e) = Generator::new(&input[..]).write_str(&mut output) {
eprintln!("Error: {e}");
std::process::exit(1);
}
diff --git a/examples/parser_vec_in_vec_out.rs b/examples/parser_vec_in_vec_out.rs
index 57ac35d..d678bff 100644
--- a/examples/parser_vec_in_vec_out.rs
+++ b/examples/parser_vec_in_vec_out.rs
@@ -5,13 +5,13 @@
//! ```
//!
//! It prints "spam".
-use bencode2json::parsers::BencodeParser;
+use bencode2json::generators::json::Generator;
fn main() {
let input = b"4:spam".to_vec();
let mut output = Vec::new();
- if let Err(e) = BencodeParser::new(&input[..]).write_bytes(&mut output) {
+ if let Err(e) = Generator::new(&input[..]).write_bytes(&mut output) {
eprintln!("Error: {e}");
std::process::exit(1);
}
diff --git a/src/parsers/error.rs b/src/error.rs
similarity index 88%
rename from src/parsers/error.rs
rename to src/error.rs
index 6874bd6..2f581a3 100644
--- a/src/parsers/error.rs
+++ b/src/error.rs
@@ -9,7 +9,7 @@ use thiserror::Error;
use crate::rw;
-use super::BencodeType;
+use super::generators::BencodeType;
/// Errors that can occur while parsing a bencoded value.
#[derive(Debug, Error)]
@@ -27,16 +27,16 @@ pub enum Error {
/// The main parser peeks one byte ahead to know what kind of bencoded value
/// is being parsed. If the byte read after peeking does not match the
/// peeked byte, it means the input is being consumed somewhere else.
- #[error("Read byte after peeking does match peeked byte; {0}; {1}")]
- ReadByteAfterPeekingDoesMatchPeekedByte(ReadContext, WriteContext),
+ #[error("Read byte after peeking does match peeked byte; {0}")]
+ ReadByteAfterPeekingDoesMatchPeekedByte(ReadContext),
/// Unrecognized first byte for new bencoded value.
///
/// The main parser peeks one byte ahead to know what kind of bencoded value
/// is being parsed. This error is raised when the peeked byte is not a
/// valid first byte for a bencoded value.
- #[error("Unrecognized first byte for new bencoded value; {0}; {1}")]
- UnrecognizedFirstBencodeValueByte(ReadContext, WriteContext),
+ #[error("Unrecognized first byte for new bencoded value; {0}")]
+ UnrecognizedFirstBencodeValueByte(ReadContext),
// Integers
/// Unexpected byte parsing integer.
@@ -44,38 +44,38 @@ pub enum Error {
/// The main parser parses integers by reading bytes until it finds the
/// end of the integer. This error is raised when the byte read is not a
/// valid byte for an integer bencoded value.
- #[error("Unexpected byte parsing integer; {0}; {1}")]
- UnexpectedByteParsingInteger(ReadContext, WriteContext),
+ #[error("Unexpected byte parsing integer; {0}")]
+ UnexpectedByteParsingInteger(ReadContext),
/// Unexpected end of input parsing integer.
///
/// The input ends before the integer ends.
- #[error("Unexpected end of input parsing integer; {0}; {1}")]
- UnexpectedEndOfInputParsingInteger(ReadContext, WriteContext),
+ #[error("Unexpected end of input parsing integer; {0}")]
+ UnexpectedEndOfInputParsingInteger(ReadContext),
/// Leading zeros in integers are not allowed, for example b'i00e'.
- #[error("Leading zeros in integers are not allowed, for example b'i00e'; {0}; {1}")]
- LeadingZerosInIntegersNotAllowed(ReadContext, WriteContext),
+ #[error("Leading zeros in integers are not allowed, for example b'i00e'; {0}")]
+ LeadingZerosInIntegersNotAllowed(ReadContext),
// Strings
/// Invalid string length byte, expected a digit.
///
/// The string parser found an invalid byte for the string length. The
/// length can only be made of digits (0-9).
- #[error("Invalid string length byte, expected a digit; {0}; {1}")]
- InvalidStringLengthByte(ReadContext, WriteContext),
+ #[error("Invalid string length byte, expected a digit; {0}")]
+ InvalidStringLengthByte(ReadContext),
/// Unexpected end of input parsing string length.
///
/// The input ends before the string length ends.
- #[error("Unexpected end of input parsing string length; {0}; {1}")]
- UnexpectedEndOfInputParsingStringLength(ReadContext, WriteContext),
+ #[error("Unexpected end of input parsing string length; {0}")]
+ UnexpectedEndOfInputParsingStringLength(ReadContext),
/// Unexpected end of input parsing string value.
///
/// The input ends before the string value ends.
- #[error("Unexpected end of input parsing string value; {0}; {1}")]
- UnexpectedEndOfInputParsingStringValue(ReadContext, WriteContext),
+ #[error("Unexpected end of input parsing string value; {0}")]
+ UnexpectedEndOfInputParsingStringValue(ReadContext),
// Lists
/// Unexpected end of input parsing list. Expecting first list item or list end.
@@ -121,7 +121,7 @@ pub enum Error {
NoMatchingStartForListOrDictEnd(ReadContext, WriteContext),
}
-/// The reader context when the error ocurred.
+/// The reader context when the error occurred.
#[derive(Debug)]
pub struct ReadContext {
/// The read byte that caused the error if any.
@@ -157,7 +157,7 @@ impl fmt::Display for ReadContext {
}
}
-/// The writer context when the error ocurred.
+/// The writer context when the error occurred.
#[derive(Debug)]
pub struct WriteContext {
/// The written byte that caused the error if any.
@@ -197,7 +197,7 @@ impl fmt::Display for WriteContext {
mod tests {
mod for_read_context {
- use crate::parsers::error::ReadContext;
+ use crate::error::ReadContext;
#[test]
fn it_should_display_the_read_context() {
@@ -237,7 +237,7 @@ mod tests {
}
mod for_write_context {
- use crate::parsers::error::WriteContext;
+ use crate::error::WriteContext;
#[test]
fn it_should_display_the_read_context() {
diff --git a/src/parsers/mod.rs b/src/generators/json.rs
similarity index 89%
rename from src/parsers/mod.rs
rename to src/generators/json.rs
index 44bebb8..ccfa6af 100644
--- a/src/parsers/mod.rs
+++ b/src/generators/json.rs
@@ -1,48 +1,29 @@
-//! Parsers, including the main parser and the parsers for the basic types
-//! (integer and string).
-//!
-//! ``BencodeParser`` is the main parser. It is generic over the type of the
-//! input buffer.
-pub mod error;
-pub mod integer;
-pub mod stack;
-pub mod string;
-
+//! JSON generator for bencoded data.
+use core::str;
use std::{
fmt::Write as FmtWrite,
- io::{self, Read, Write as IoWrite},
+ io::{Read, Write as IoWrite},
};
-use derive_more::derive::Display;
-use error::{ReadContext, WriteContext};
-use stack::{Stack, State};
-
-use crate::rw::{
- byte_reader::ByteReader, byte_writer::ByteWriter, string_writer::StringWriter, writer::Writer,
+use super::{
+ stack::{Stack, State},
+ BencodeType,
};
+use tokenizer::{BencodeToken, Tokenizer};
-// Bencoded reserved bytes
-const BENCODE_BEGIN_INTEGER: u8 = b'i';
-const BENCODE_END_INTEGER: u8 = b'e';
-const BENCODE_BEGIN_LIST: u8 = b'l';
-const BENCODE_BEGIN_DICT: u8 = b'd';
-const BENCODE_END_LIST_OR_DICT: u8 = b'e';
-
-#[derive(Debug, PartialEq, Display)]
-pub enum BencodeType {
- Integer,
- String,
- List,
- Dict,
-}
+use crate::{
+ error::{self, ReadContext, WriteContext},
+ rw::{byte_writer::ByteWriter, string_writer::StringWriter, writer::Writer},
+ tokenizer,
+};
-pub struct BencodeParser {
- byte_reader: ByteReader,
+pub struct Generator {
+ tokenizer: Tokenizer,
num_processed_tokens: u64,
stack: Stack,
}
-impl BencodeParser {
+impl Generator {
const JSON_ARRAY_BEGIN: u8 = b'[';
const JSON_ARRAY_ITEMS_SEPARATOR: u8 = b',';
const JSON_ARRAY_END: u8 = b']';
@@ -53,8 +34,8 @@ impl BencodeParser {
const JSON_OBJ_END: u8 = b'}';
pub fn new(reader: R) -> Self {
- BencodeParser {
- byte_reader: ByteReader::new(reader),
+ Generator {
+ tokenizer: Tokenizer::new(reader),
num_processed_tokens: 1,
stack: Stack::default(),
}
@@ -104,49 +85,49 @@ impl BencodeParser {
/// - It can't read from the input or write to the output.
/// - The input is invalid Bencode.
fn parse(&mut self, writer: &mut W) -> Result<(), error::Error> {
- while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, writer)? {
- match peeked_byte {
- BENCODE_BEGIN_INTEGER => {
+ while let Some(token) = self.tokenizer.next_token()? {
+ match token {
+ BencodeToken::Integer(integer_bytes) => {
self.begin_bencoded_value(BencodeType::Integer, writer)?;
- integer::parse(&mut self.byte_reader, writer)?;
+ // todo: add `write_bytes` to writer.
+ for bytes in integer_bytes {
+ writer.write_byte(bytes)?;
+ }
}
- b'0'..=b'9' => {
+ BencodeToken::String(string_bytes) => {
self.begin_bencoded_value(BencodeType::String, writer)?;
- string::parse(&mut self.byte_reader, writer)?;
+
+ let html_tag_style_string = match str::from_utf8(&string_bytes) {
+ Ok(string) => {
+ // String only contains valid UTF-8 chars -> print it as is
+ &format!("{}", string.to_owned())
+ }
+ Err(_) => {
+ // String contains invalid UTF-8 chars -> print it as hex bytes
+ &format!("{}", hex::encode(string_bytes))
+ }
+ };
+
+ writer.write_str(
+ &serde_json::to_string(&html_tag_style_string)
+ .expect("Failed to serialize to JSON. This should not happen because non UTF-8 bencoded string are serialized as hex bytes"),
+ )?;
}
- BENCODE_BEGIN_LIST => {
- let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
+ BencodeToken::BeginList => {
self.begin_bencoded_value(BencodeType::List, writer)?;
writer.write_byte(Self::JSON_ARRAY_BEGIN)?;
self.stack.push(State::ExpectingFirstListItemOrEnd);
}
- BENCODE_BEGIN_DICT => {
- let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
+ BencodeToken::BeginDict => {
self.begin_bencoded_value(BencodeType::Dict, writer)?;
writer.write_byte(Self::JSON_OBJ_BEGIN)?;
self.stack.push(State::ExpectingFirstDictFieldOrEnd);
}
- BENCODE_END_LIST_OR_DICT => {
- let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
+ BencodeToken::EndListOrDict => {
self.end_list_or_dict(writer)?;
}
- b'\n' => {
+ BencodeToken::LineBreak => {
// Ignore line breaks at the beginning, the end, or between values
- let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
- }
- _ => {
- return Err(error::Error::UnrecognizedFirstBencodeValueByte(
- ReadContext {
- byte: Some(peeked_byte),
- pos: self.byte_reader.input_byte_counter(),
- latest_bytes: self.byte_reader.captured_bytes(),
- },
- WriteContext {
- byte: Some(peeked_byte),
- pos: writer.output_byte_counter(),
- latest_bytes: writer.captured_bytes(),
- },
- ));
}
}
@@ -156,68 +137,6 @@ impl BencodeParser {
self.check_bad_end_stack_state(writer)
}
- /// It reads the next byte from the input consuming it. It returns `None` if
- /// the input has ended.
- ///
- /// # Errors
- ///
- /// Will return and errors if:
- ///
- /// - It can't read from the input.
- /// - The byte read is not the expected one (the previously peeked byte).
- fn read_peeked_byte(
- peeked_byte: u8,
- reader: &mut ByteReader,
- writer: &W,
- ) -> Result