From 9b3c42ba485ca5af91dc3ee2cee06e7f139ef64f Mon Sep 17 00:00:00 2001 From: qraqras Date: Mon, 16 Mar 2026 01:21:07 +0000 Subject: [PATCH 1/3] feat: support Google-style bracket entries in NumPy docstrings --- Cargo.toml | 2 +- src/parse/google/parser.rs | 131 ++++------------------ src/parse/numpy/parser.rs | 68 +++++++++++- src/parse/utils.rs | 215 +++++++++++++++++++++++++++++++++++++ tests/model.rs | 31 ++++++ tests/numpy/parameters.rs | 145 +++++++++++++++++++++++++ 6 files changed, 481 insertions(+), 111 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a0271ae..b563d32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ readme = "README.md" keywords = ["python", "docstring", "parser", "google", "numpy"] categories = ["parser-implementations", "development-tools"] rust-version = "1.85" -exclude = [".devcontainer/", ".githooks/", ".github/", "target/", "tests/", "bindings/"] +exclude = [".devcontainer/", ".githooks/", ".github/", "bindings/", "target/", "tests/"] [lib] name = "pydocstring" diff --git a/src/parse/google/parser.rs b/src/parse/google/parser.rs index 18b8c8b..de2434d 100644 --- a/src/parse/google/parser.rs +++ b/src/parse/google/parser.rs @@ -5,7 +5,7 @@ use crate::cursor::{LineCursor, indent_len}; use crate::parse::google::kind::GoogleSectionKind; -use crate::parse::utils::{find_entry_colon, find_matching_close, split_comma_parts}; +use crate::parse::utils::{find_entry_colon, try_parse_bracket_entry}; use crate::syntax::{Parsed, SyntaxElement, SyntaxKind, SyntaxNode, SyntaxToken}; use crate::text::TextRange; @@ -25,42 +25,6 @@ fn extract_section_name(trimmed: &str) -> (&str, bool) { } } -// ============================================================================= -// Optional helpers -// ============================================================================= - -/// Strip a trailing `optional` marker from a type annotation. 
-/// -/// Uses bracket-aware comma splitting so that commas inside type -/// annotations like `Dict[str, int]` are never mistaken for the -/// separator before `optional`. -/// -/// Returns `(clean_type, optional_byte_offset)` where the offset is -/// relative to the start of `type_content` and points to the `o` in -/// `optional`. -fn strip_optional(type_content: &str) -> (&str, Option) { - let parts = split_comma_parts(type_content); - let mut optional_offset = None; - let mut type_end = 0; - - for &(seg_offset, seg_raw) in &parts { - let seg = seg_raw.trim(); - if seg == "optional" { - let ws_lead = seg_raw.len() - seg_raw.trim_start().len(); - optional_offset = Some(seg_offset + ws_lead); - } else if !seg.is_empty() { - type_end = seg_offset + seg_raw.trim_end().len(); - } - } - - if let Some(opt) = optional_offset { - let clean = type_content[..type_end].trim_end_matches(',').trim_end(); - (clean, Some(opt)) - } else { - (type_content, None) - } -} - // ============================================================================= // Entry header parsing // ============================================================================= @@ -89,43 +53,25 @@ fn parse_entry_header(cursor: &LineCursor, parse_type: bool) -> EntryHeader { let entry_start = cursor.substr_offset(trimmed); // --- Pattern 1: `name (type): desc` or `name(type): desc` --- - let bracket_pos = if parse_type { - trimmed.bytes().enumerate().find_map(|(i, b)| { - if (b == b'(' || b == b'[' || b == b'{' || b == b'<') && i > 0 { - Some(i) - } else { - None - } - }) - } else { - None - }; - - if let Some(rel_paren) = bracket_pos { - if let Some(rel_close) = find_matching_close(trimmed, rel_paren) { - let abs_paren = entry_start + rel_paren; - let abs_close = entry_start + rel_close; - - let name = trimmed[..rel_paren].trim_end(); - let name_span = TextRange::from_offset_len(entry_start, name.len()); - let open_bracket = TextRange::from_offset_len(abs_paren, 1); - let close_bracket = 
TextRange::from_offset_len(abs_close, 1); - - let type_raw = &trimmed[rel_paren + 1..rel_close]; - let type_trimmed = type_raw.trim(); - let leading_ws = type_raw.len() - type_raw.trim_start().len(); - let type_start = abs_paren + 1 + leading_ws; - - let (clean_type, opt_rel) = strip_optional(type_trimmed); - let opt_span = - opt_rel.map(|r| TextRange::from_offset_len(type_start + r, "optional".len())); - - let type_span = if !clean_type.is_empty() { - Some(TextRange::from_offset_len(type_start, clean_type.len())) + if parse_type { + if let Some(entry) = try_parse_bracket_entry(trimmed) { + let name_span = TextRange::from_offset_len(entry_start, entry.name.len()); + let open_bracket = TextRange::from_offset_len(entry_start + entry.open_bracket, 1); + let close_bracket = TextRange::from_offset_len(entry_start + entry.close_bracket, 1); + + let type_span = if !entry.clean_type.is_empty() { + Some(TextRange::from_offset_len( + entry_start + entry.type_offset, + entry.clean_type.len(), + )) } else { None }; + let opt_span = entry + .optional_offset + .map(|r| TextRange::from_offset_len(entry_start + r, "optional".len())); + let type_info = Some(TypeInfo { open_bracket, r#type: type_span, @@ -133,8 +79,12 @@ fn parse_entry_header(cursor: &LineCursor, parse_type: bool) -> EntryHeader { optional: opt_span, }); - let after_close = &trimmed[rel_close + 1..]; - let (first_description, colon) = extract_desc_after_colon(after_close, abs_close + 1); + let colon = entry + .colon + .map(|c| TextRange::from_offset_len(entry_start + c, 1)); + let first_description = entry.description_offset.map(|d| { + TextRange::from_offset_len(entry_start + d, entry.description.unwrap().len()) + }); let range_end = if let Some(ref desc) = first_description { desc.end() @@ -193,29 +143,6 @@ fn parse_entry_header(cursor: &LineCursor, parse_type: bool) -> EntryHeader { } } -/// Extract description and colon spans after the closing bracket. 
-fn extract_desc_after_colon( - after_paren: &str, - base_offset: usize, -) -> (Option, Option) { - let stripped = after_paren.trim_start(); - if let Some(after_colon) = stripped.strip_prefix(':') { - let desc = after_colon.trim_start(); - let leading_to_stripped = after_paren.len() - stripped.len(); - let leading_after_colon = after_colon.len() - desc.len(); - let colon_abs = base_offset + leading_to_stripped; - let desc_start = colon_abs + 1 + leading_after_colon; - let desc_range = if desc.is_empty() { - None - } else { - Some(TextRange::from_offset_len(desc_start, desc.len())) - }; - (desc_range, Some(TextRange::from_offset_len(colon_abs, 1))) - } else { - (None, None) - } -} - // ============================================================================= // Section header parsing // ============================================================================= @@ -1070,18 +997,4 @@ mod tests { assert_eq!(ti.r#type.unwrap().source_text(src), "Dict[str, int]"); assert_eq!(header.first_description.unwrap().source_text(src), "Values"); } - - #[test] - fn test_strip_optional_basic() { - assert_eq!(strip_optional("int, optional"), ("int", Some(5))); - assert_eq!(strip_optional("int"), ("int", None)); - assert_eq!( - strip_optional("Dict[str, int], optional"), - ("Dict[str, int]", Some(16)) - ); - assert_eq!(strip_optional("optional"), ("", Some(0))); - assert_eq!(strip_optional("int,optional"), ("int", Some(4))); - assert_eq!(strip_optional("int, optional"), ("int", Some(6))); - assert_eq!(strip_optional("int, optional "), ("int", Some(5))); - } } diff --git a/src/parse/numpy/parser.rs b/src/parse/numpy/parser.rs index d1d6e93..3c75917 100644 --- a/src/parse/numpy/parser.rs +++ b/src/parse/numpy/parser.rs @@ -5,7 +5,9 @@ use crate::cursor::{LineCursor, indent_columns, indent_len}; use crate::parse::numpy::kind::NumPySectionKind; -use crate::parse::utils::{find_entry_colon, find_matching_close, split_comma_parts}; +use crate::parse::utils::{ + find_entry_colon, 
find_matching_close, split_comma_parts, try_parse_bracket_entry, +}; use crate::syntax::{Parsed, SyntaxElement, SyntaxKind, SyntaxNode, SyntaxToken}; use crate::text::TextRange; @@ -104,6 +106,7 @@ struct ParamHeaderParts { default_keyword: Option, default_separator: Option, default_value: Option, + first_description: Option, } fn parse_name_and_type( @@ -112,6 +115,11 @@ fn parse_name_and_type( col_base: usize, cursor: &LineCursor, ) -> ParamHeaderParts { + // --- Google-style bracket pattern: `name (type): desc` --- + if let Some(result) = try_parse_google_style_entry(text, line_idx, col_base, cursor) { + return result; + } + let Some(colon_pos) = find_entry_colon(text) else { let names = parse_name_list(text, line_idx, col_base, cursor); return ParamHeaderParts { @@ -122,6 +130,7 @@ fn parse_name_and_type( default_keyword: None, default_separator: None, default_value: None, + first_description: None, }; }; @@ -142,6 +151,7 @@ fn parse_name_and_type( default_keyword: None, default_separator: None, default_value: None, + first_description: None, }; } @@ -215,9 +225,59 @@ fn parse_name_and_type( default_keyword, default_separator, default_value, + first_description: None, } } +/// Try to parse a Google-style entry `name (type): desc` or `name(type): desc`. +/// +/// Returns `Some(ParamHeaderParts)` when the line matches the bracket-style +/// pattern. Otherwise returns `None` so that the caller falls through to +/// the normal NumPy parsing path. 
+fn try_parse_google_style_entry( + text: &str, + line_idx: usize, + col_base: usize, + cursor: &LineCursor, +) -> Option { + let entry = try_parse_bracket_entry(text)?; + + let names = parse_name_list(entry.name, line_idx, col_base, cursor); + + let param_type = if !entry.clean_type.is_empty() { + Some(cursor.make_line_range( + line_idx, + col_base + entry.type_offset, + entry.clean_type.len(), + )) + } else { + None + }; + + let optional = entry + .optional_offset + .map(|r| cursor.make_line_range(line_idx, col_base + r, "optional".len())); + + let colon = entry + .colon + .map(|c| cursor.make_line_range(line_idx, col_base + c, 1)); + + let first_description = entry + .description_offset + .map(|d| cursor.make_line_range(line_idx, col_base + d, entry.description.unwrap().len())); + + Some(ParamHeaderParts { + names, + colon, + param_type, + optional, + default_keyword: None, + default_separator: None, + default_value: None, + first_description, + }) +} + fn parse_name_list( text: &str, line_idx: usize, @@ -293,6 +353,12 @@ fn build_parameter_node(parts: &ParamHeaderParts, range: TextRange) -> SyntaxNod dv, ))); } + if let Some(desc) = parts.first_description { + children.push(SyntaxElement::Token(SyntaxToken::new( + SyntaxKind::DESCRIPTION, + desc, + ))); + } SyntaxNode::new(SyntaxKind::NUMPY_PARAMETER, range, children) } diff --git a/src/parse/utils.rs b/src/parse/utils.rs index 09b8b17..fbc4008 100644 --- a/src/parse/utils.rs +++ b/src/parse/utils.rs @@ -71,10 +71,144 @@ pub(crate) fn find_matching_close(s: &str, open_pos: usize) -> Option { None } +// ============================================================================= +// Optional marker stripping +// ============================================================================= + +/// Strip a trailing `optional` marker from a type annotation. 
+/// +/// Uses bracket-aware comma splitting so that commas inside type +/// annotations like `Dict[str, int]` are never mistaken for the +/// separator before `optional`. +/// +/// Returns `(clean_type, optional_byte_offset)` where the offset is +/// relative to the start of `type_content` and points to the `o` in +/// `optional`. +pub(crate) fn strip_optional(type_content: &str) -> (&str, Option) { + let parts = split_comma_parts(type_content); + let mut optional_offset = None; + let mut type_end = 0; + + for &(seg_offset, seg_raw) in &parts { + let seg = seg_raw.trim(); + if seg == "optional" { + let ws_lead = seg_raw.len() - seg_raw.trim_start().len(); + optional_offset = Some(seg_offset + ws_lead); + } else if !seg.is_empty() { + type_end = seg_offset + seg_raw.trim_end().len(); + } + } + + if let Some(opt) = optional_offset { + let clean = type_content[..type_end].trim_end_matches(',').trim_end(); + (clean, Some(opt)) + } else { + (type_content, None) + } +} + +// ============================================================================= +// Bracket-style entry parsing +// ============================================================================= + +/// Parsed byte-offset information for a bracket-style entry: `name(type): desc`. +/// +/// All byte offsets are relative to the start of the input `text`. +pub(crate) struct BracketEntry<'a> { + /// Name text before the bracket (end-trimmed). + pub name: &'a str, + /// Byte offset of the opening bracket. + pub open_bracket: usize, + /// Byte offset of the closing bracket. + pub close_bracket: usize, + /// Clean type text (optional stripped) inside brackets. + pub clean_type: &'a str, + /// Byte offset of the type text start. + pub type_offset: usize, + /// Byte offset of `optional` keyword, if present. + pub optional_offset: Option, + /// Byte offset of the colon after the close bracket, if present. + pub colon: Option, + /// Description text after the colon (trimmed), if present. 
+ pub description: Option<&'a str>, + /// Byte offset of the description start, if present. + pub description_offset: Option, +} + +/// Try to parse a bracket-style entry `name(type): desc` or `name (type): desc`. +/// +/// Returns `Some(BracketEntry)` when a bracket appears before the first +/// top-level colon and has a matching close, followed by `:` or end-of-text. +/// Returns `None` otherwise, so the caller can fall through to other patterns. +pub(crate) fn try_parse_bracket_entry(text: &str) -> Option> { + // Find the first opening bracket that comes after at least one character. + let bracket_pos = text.bytes().enumerate().find_map(|(i, b)| { + if i > 0 && matches!(b, b'(' | b'[' | b'{' | b'<') { + Some(i) + } else { + None + } + })?; + + // The bracket must appear before any top-level colon. + if let Some(colon_pos) = find_entry_colon(text) { + if colon_pos < bracket_pos { + return None; + } + } + + let close_pos = find_matching_close(text, bracket_pos)?; + + // After the closing bracket there must be `:` (with optional whitespace) + // or end-of-text. 
+ let after_close = text[close_pos + 1..].trim_start(); + if !after_close.is_empty() && !after_close.starts_with(':') { + return None; + } + + let name = text[..bracket_pos].trim_end(); + + let type_raw = &text[bracket_pos + 1..close_pos]; + let type_trimmed = type_raw.trim(); + let leading_ws = type_raw.len() - type_raw.trim_start().len(); + let type_offset = bracket_pos + 1 + leading_ws; + + let (clean_type, opt_rel) = strip_optional(type_trimmed); + let optional_offset = opt_rel.map(|r| type_offset + r); + + let (colon, description, description_offset) = if after_close.starts_with(':') { + let colon_byte = text[close_pos + 1..].find(':').unwrap() + close_pos + 1; + let after_colon = &text[colon_byte + 1..]; + let desc = after_colon.trim(); + if desc.is_empty() { + (Some(colon_byte), None, None) + } else { + let ws = after_colon.len() - after_colon.trim_start().len(); + (Some(colon_byte), Some(desc), Some(colon_byte + 1 + ws)) + } + } else { + (None, None, None) + }; + + Some(BracketEntry { + name, + open_bracket: bracket_pos, + close_bracket: close_pos, + clean_type, + type_offset, + optional_offset, + colon, + description, + description_offset, + }) +} + #[cfg(test)] mod tests { use super::*; + // ---- find_entry_colon ---- + #[test] fn test_find_entry_colon() { // Basic colon @@ -143,4 +277,85 @@ mod tests { fn test_find_matching_close_angle_brackets() { assert_eq!(find_matching_close("", 0), Some(4)); } + + // ---- strip_optional ---- + + #[test] + fn test_strip_optional_basic() { + assert_eq!(strip_optional("int, optional"), ("int", Some(5))); + assert_eq!(strip_optional("int"), ("int", None)); + assert_eq!( + strip_optional("Dict[str, int], optional"), + ("Dict[str, int]", Some(16)) + ); + assert_eq!(strip_optional("optional"), ("", Some(0))); + assert_eq!(strip_optional("int,optional"), ("int", Some(4))); + assert_eq!(strip_optional("int, optional"), ("int", Some(6))); + assert_eq!(strip_optional("int, optional "), ("int", Some(5))); + } + + // ---- 
try_parse_bracket_entry ---- + + #[test] + fn test_bracket_entry_basic() { + let e = try_parse_bracket_entry("name (int): desc").unwrap(); + assert_eq!(e.name, "name"); + assert_eq!(e.clean_type, "int"); + assert_eq!(e.description, Some("desc")); + } + + #[test] + fn test_bracket_entry_no_space() { + let e = try_parse_bracket_entry("name(int): desc").unwrap(); + assert_eq!(e.name, "name"); + assert_eq!(e.clean_type, "int"); + } + + #[test] + fn test_bracket_entry_optional() { + let e = try_parse_bracket_entry("name (int, optional): desc").unwrap(); + assert_eq!(e.clean_type, "int"); + assert!(e.optional_offset.is_some()); + } + + #[test] + fn test_bracket_entry_complex_type() { + let e = try_parse_bracket_entry("data (Dict[str, int]): values").unwrap(); + assert_eq!(e.clean_type, "Dict[str, int]"); + assert_eq!(e.description, Some("values")); + } + + #[test] + fn test_bracket_entry_no_colon() { + let e = try_parse_bracket_entry("name (int)").unwrap(); + assert_eq!(e.name, "name"); + assert_eq!(e.clean_type, "int"); + assert!(e.colon.is_none()); + assert!(e.description.is_none()); + } + + #[test] + fn test_bracket_entry_empty_desc() { + let e = try_parse_bracket_entry("name (int):").unwrap(); + assert_eq!(e.clean_type, "int"); + assert!(e.colon.is_some()); + assert!(e.description.is_none()); + } + + #[test] + fn test_bracket_entry_colon_before_bracket() { + // `name : (int)` should NOT match — colon is before bracket. + assert!(try_parse_bracket_entry("name : (int)").is_none()); + } + + #[test] + fn test_bracket_entry_no_bracket() { + assert!(try_parse_bracket_entry("name : int").is_none()); + } + + #[test] + fn test_bracket_entry_text_after_bracket() { + // `name (int) not_colon` — non-colon text after bracket. 
+ assert!(try_parse_bracket_entry("name (int) not_colon").is_none()); + } } diff --git a/tests/model.rs b/tests/model.rs index 802a344..e02719b 100644 --- a/tests/model.rs +++ b/tests/model.rs @@ -367,3 +367,34 @@ fn same_ir_from_both_styles() { _ => panic!("both should be Parameters sections"), } } + +// ============================================================================= +// NumPy IR: Google-style entries in NumPy sections +// ============================================================================= + +#[test] +fn numpy_google_style_entry_to_model() { + let parsed = parse_numpy("Summary.\n\nParameters\n----------\nname (str): The name.\n"); + let doc = numpy_to_model(&parsed).unwrap(); + let params = match &doc.sections[0] { + Section::Parameters(p) => p, + _ => panic!("expected Parameters"), + }; + assert_eq!(params.len(), 1); + assert_eq!(params[0].names, vec!["name"]); + assert_eq!(params[0].type_annotation.as_deref(), Some("str")); + assert_eq!(params[0].description.as_deref(), Some("The name.")); +} + +#[test] +fn numpy_google_style_optional_to_model() { + let parsed = + parse_numpy("Summary.\n\nParameters\n----------\nname (str, optional): The name.\n"); + let doc = numpy_to_model(&parsed).unwrap(); + let params = match &doc.sections[0] { + Section::Parameters(p) => p, + _ => panic!("expected Parameters"), + }; + assert_eq!(params[0].type_annotation.as_deref(), Some("str")); + assert!(params[0].is_optional); +} diff --git a/tests/numpy/parameters.rs b/tests/numpy/parameters.rs index e43e127..d78b442 100644 --- a/tests/numpy/parameters.rs +++ b/tests/numpy/parameters.rs @@ -430,3 +430,148 @@ fn test_receives_section_body_variant() { let params: Vec<_> = s.parameters().collect(); assert_eq!(params.len(), 1); } + +// ============================================================================= +// Google-style entry format in NumPy sections +// ============================================================================= + +#[test] +fn 
test_google_style_entry_in_numpy_section() { + let docstring = "Summary.\n\nParameters\n----------\nname (str): The name.\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 1); + + let names: Vec<_> = params[0].names().collect(); + assert_eq!(names[0].text(result.source()), "name"); + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("str") + ); + assert_eq!( + params[0].description().map(|t| t.text(result.source())), + Some("The name.") + ); +} + +#[test] +fn test_google_style_entry_with_optional() { + let docstring = "Summary.\n\nParameters\n----------\nname (str, optional): The name.\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 1); + + let names: Vec<_> = params[0].names().collect(); + assert_eq!(names[0].text(result.source()), "name"); + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("str") + ); + assert!(params[0].optional().is_some()); + assert_eq!( + params[0].description().map(|t| t.text(result.source())), + Some("The name.") + ); +} + +#[test] +fn test_google_style_entry_no_description() { + let docstring = "Summary.\n\nParameters\n----------\nname (int):\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 1); + + let names: Vec<_> = params[0].names().collect(); + assert_eq!(names[0].text(result.source()), "name"); + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("int") + ); + assert!(params[0].description().is_none()); +} + +#[test] +fn test_google_style_entry_with_continuation() { + let docstring = + "Summary.\n\nParameters\n----------\nname (str): The name.\n Continued here.\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 1); + + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("str") + ); + assert_eq!( + 
params[0].description().map(|t| t.text(result.source())), + Some("The name.\n Continued here.") + ); +} + +#[test] +fn test_google_style_entry_complex_type() { + let docstring = "Summary.\n\nParameters\n----------\ndata (Dict[str, int]): The mapping.\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 1); + + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("Dict[str, int]") + ); + assert_eq!( + params[0].description().map(|t| t.text(result.source())), + Some("The mapping.") + ); +} + +#[test] +fn test_google_style_mixed_with_numpy_style() { + let docstring = "Summary.\n\nParameters\n----------\nx (int): First.\ny : str\n Second.\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 2); + + // Google-style entry + assert_eq!(params[0].names().next().unwrap().text(result.source()), "x"); + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("int") + ); + assert_eq!( + params[0].description().map(|t| t.text(result.source())), + Some("First.") + ); + + // NumPy-style entry + assert_eq!(params[1].names().next().unwrap().text(result.source()), "y"); + assert_eq!( + params[1].r#type().map(|t| t.text(result.source())), + Some("str") + ); + assert_eq!( + params[1].description().map(|t| t.text(result.source())), + Some("Second.") + ); +} + +#[test] +fn test_google_style_entry_no_colon_after_bracket() { + let docstring = "Summary.\n\nParameters\n----------\nname (int)\n Desc.\n"; + let result = parse_numpy(docstring); + let params = parameters(&result); + assert_eq!(params.len(), 1); + + assert_eq!( + params[0].names().next().unwrap().text(result.source()), + "name" + ); + assert_eq!( + params[0].r#type().map(|t| t.text(result.source())), + Some("int") + ); + assert_eq!( + params[0].description().map(|t| t.text(result.source())), + Some("Desc.") + ); +} From ccc80379b8d9cf348601aa6d138af6caf0bb67f4 Mon Sep 17 00:00:00 2001 
From: qraqras Date: Mon, 16 Mar 2026 01:46:41 +0000 Subject: [PATCH 2/3] feat: add lineno/column position lookup for syntax tokens --- bindings/python/Cargo.lock | 4 +- bindings/python/pydocstring.pyi | 25 ++++++ bindings/python/src/lib.rs | 73 ++++++++++++++++ bindings/python/tests/test_pydocstring.py | 100 ++++++++++++++++++++++ src/syntax.rs | 17 +++- src/text.rs | 64 ++++++++++++++ 6 files changed, 279 insertions(+), 4 deletions(-) diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index ae4e27c..ae1277b 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -67,11 +67,11 @@ dependencies = [ [[package]] name = "pydocstring" -version = "0.1.0" +version = "0.1.1" [[package]] name = "pydocstring-python" -version = "0.1.0" +version = "0.1.1" dependencies = [ "pydocstring", "pyo3", diff --git a/bindings/python/pydocstring.pyi b/bindings/python/pydocstring.pyi index d21a1ae..84e7396 100644 --- a/bindings/python/pydocstring.pyi +++ b/bindings/python/pydocstring.pyi @@ -63,6 +63,17 @@ class TextRange: @property def end(self) -> int: ... +class LineColumn: + """A line/column position in the source text. + + ``lineno`` is 1-based; ``col`` is 0-based and counted in Unicode + codepoints, matching Python's ``ast`` module convention. + """ + @property + def lineno(self) -> int: ... + @property + def col(self) -> int: ... + class Token: @property def kind(self) -> SyntaxKind: ... @@ -130,6 +141,13 @@ class GoogleDocstring: def source(self) -> str: ... def pretty_print(self) -> str: ... def to_model(self) -> Docstring: ... + def line_col(self, offset: int) -> LineColumn: + """Convert a byte offset to a LineColumn. + + The offset is typically ``token.range.start`` or ``token.range.end``. + ``lineno`` is 1-based; ``col`` is 0-based Unicode codepoints. + """ + ... # ─── NumPy ─────────────────────────────────────────────────────────────────── @@ -186,6 +204,13 @@ class NumPyDocstring: def source(self) -> str: ... 
def pretty_print(self) -> str: ... def to_model(self) -> Docstring: ... + def line_col(self, offset: int) -> LineColumn: + """Convert a byte offset to a LineColumn. + + The offset is typically ``token.range.start`` or ``token.range.end``. + ``lineno`` is 1-based; ``col`` is 0-based Unicode codepoints. + """ + ... # ─── Model IR ──────────────────────────────────────────────────────────────── diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index e4e4268..eeefeed 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -39,6 +39,60 @@ impl From<&TextRange> for PyTextRange { } } +// ─── LineColumn ───────────────────────────────────────────────────────────── + +/// A line/column position in the source text. +/// +/// `lineno` is 1-based; `col` is the 0-based Unicode codepoint offset from +/// the start of the line (compatible with Python's `ast` module). +#[pyclass(name = "LineColumn", frozen)] +struct PyLineColumn { + #[pyo3(get)] + lineno: u32, + #[pyo3(get)] + col: u32, +} + +#[pymethods] +impl PyLineColumn { + fn __repr__(&self) -> String { + format!("LineColumn(lineno={}, col={})", self.lineno, self.col) + } +} + +/// Convert a byte offset into a `PyLineColumn` with codepoint-based `col`. +/// +/// Returns an error if `byte_offset` is beyond the source length or does not +/// land on a UTF-8 character boundary. +fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> PyResult { + if byte_offset > source.len() { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "offset {} is out of bounds (source length: {})", + byte_offset, + source.len() + ))); + } + let mut lineno = 1u32; + let mut line_start = 0usize; + for (i, b) in source.bytes().enumerate() { + if i >= byte_offset { + break; + } + if b == b'\n' { + lineno += 1; + line_start = i + 1; + } + } + // Verify the offset falls on a char boundary before slicing. 
+ if !source.is_char_boundary(byte_offset) || !source.is_char_boundary(line_start) { + return Err(pyo3::exceptions::PyValueError::new_err( + "offset is not on a UTF-8 character boundary", + )); + } + let col = source[line_start..byte_offset].chars().count() as u32; + Ok(PyLineColumn { lineno, col }) +} + // ─── SyntaxKind ────────────────────────────────────────────────────────────── /// Syntax node/token kind enum. @@ -469,6 +523,15 @@ impl PyGoogleDocstring { .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("failed to convert to model"))?; Ok(PyModelDocstring { inner: doc }) } + /// Convert a byte offset to a `LineColumn` with codepoint-based `col`. + /// + /// The offset is typically obtained from `Token.range.start` or + /// `Token.range.end`. `lineno` is 1-based; `col` is 0-based and counted + /// in Unicode codepoints, matching Python's `ast` module convention. + fn line_col(&self, py: Python<'_>, offset: u32) -> PyResult> { + let lc = byte_offset_to_line_col(&self.source, offset as usize)?; + Py::new(py, lc) + } fn __repr__(&self) -> String { "GoogleDocstring(...)".to_string() } @@ -640,6 +703,15 @@ impl PyNumPyDocstring { .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("failed to convert to model"))?; Ok(PyModelDocstring { inner: doc }) } + /// Convert a byte offset to a `LineColumn` with codepoint-based `col`. + /// + /// The offset is typically obtained from `Token.range.start` or + /// `Token.range.end`. `lineno` is 1-based; `col` is 0-based and counted + /// in Unicode codepoints, matching Python's `ast` module convention. 
+ fn line_col(&self, py: Python<'_>, offset: u32) -> PyResult> { + let lc = byte_offset_to_line_col(&self.source, offset as usize)?; + Py::new(py, lc) + } fn __repr__(&self) -> String { "NumPyDocstring(...)".to_string() } @@ -1607,6 +1679,7 @@ fn pydocstring(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/bindings/python/tests/test_pydocstring.py b/bindings/python/tests/test_pydocstring.py index 402face..6471397 100644 --- a/bindings/python/tests/test_pydocstring.py +++ b/bindings/python/tests/test_pydocstring.py @@ -419,6 +419,106 @@ def test_convert_google_to_numpy(self): assert "----------" in numpy_text assert "x : int" in numpy_text + +class TestLineCol: + """Tests for GoogleDocstring.line_col() and NumPyDocstring.line_col().""" + + # ── Google ─────────────────────────────────────────────────────────────── + + def test_google_summary_first_line(self): + doc = pydocstring.parse_google("Summary.") + lc = doc.line_col(doc.summary.range.start) + assert lc.lineno == 1 + assert lc.col == 0 + + def test_google_arg_name_lineno(self): + src = "Summary.\n\nArgs:\n x (int): Value." + doc = pydocstring.parse_google(src) + arg = doc.sections[0].args[0] + lc = doc.line_col(arg.name.range.start) + assert lc.lineno == 4 # " x (int): Value." is on line 4 + assert lc.col == 4 # 4 spaces of indentation + + def test_google_col_is_codepoints_not_bytes(self): + # "α" is 2 bytes in UTF-8 but 1 codepoint. + # Source: "α.\n\nArgs:\n x: V." + # "x" starts at byte 4+4=... let's compute: + # line 1: "α.\n" → α=2bytes, .=1, \n=1 → line_start line4 = 2+1+1+1 = 5 + # line 2: "\n" → 1byte + # line 3: "Args:\n" → 6bytes + # line 4: " x: V.\n" → " x" starts with 4 spaces + x + # byte of "x" in line4 = 5+1+6+4 = 16 + src = "α.\n\nArgs:\n x: V." 
+ doc = pydocstring.parse_google(src) + arg = doc.sections[0].args[0] + lc = doc.line_col(arg.name.range.start) + assert lc.lineno == 4 + assert lc.col == 4 # 4 spaces → 4 codepoints (bytes == codepoints here) + + def test_google_multibyte_col(self): + # Line with multibyte chars before the token. + # "αβ: int" as the summary — check col of "int" token text + # α=2bytes, β=2bytes, :=1, space=1 → "int" starts at byte 6 + # but codepoints: α=1, β=1, :=1, space=1 → col=4 + src = "αβ: int" + doc = pydocstring.parse_google(src) + # The whole line is treated as summary; check that line_col at byte 6 + # returns col 4 (codepoints), not 6 (bytes) + lc = doc.line_col(6) + assert lc.lineno == 1 + assert lc.col == 4 + + def test_google_multiline_lineno(self): + src = "Summary.\n\nExtended.\n\nArgs:\n x: V." + doc = pydocstring.parse_google(src) + arg = doc.sections[0].args[0] + lc = doc.line_col(arg.name.range.start) + assert lc.lineno == 6 + + def test_google_returns_class(self): + lc = pydocstring.parse_google("S.").line_col(0) + assert isinstance(lc, pydocstring.LineColumn) + + def test_google_out_of_bounds(self): + import pytest + doc = pydocstring.parse_google("S.") + with pytest.raises(Exception): + doc.line_col(9999) + + # ── NumPy ──────────────────────────────────────────────────────────────── + + def test_numpy_summary_first_line(self): + doc = pydocstring.parse_numpy("Summary.") + lc = doc.line_col(doc.summary.range.start) + assert lc.lineno == 1 + assert lc.col == 0 + + def test_numpy_param_name_lineno(self): + src = "Summary.\n\nParameters\n----------\nx : int\n Desc." 
+ doc = pydocstring.parse_numpy(src) + param = doc.sections[0].parameters[0] + lc = doc.line_col(param.names[0].range.start) + assert lc.lineno == 5 # "x : int" is on line 5 + assert lc.col == 0 + + def test_numpy_multibyte_col(self): + # Same multibyte check for NumPy path + src = "αβ: int" + doc = pydocstring.parse_numpy(src) + lc = doc.line_col(6) + assert lc.lineno == 1 + assert lc.col == 4 + + def test_numpy_returns_class(self): + lc = pydocstring.parse_numpy("S.").line_col(0) + assert isinstance(lc, pydocstring.LineColumn) + + def test_numpy_out_of_bounds(self): + import pytest + doc = pydocstring.parse_numpy("S.") + with pytest.raises(Exception): + doc.line_col(9999) + def test_emit_free_text_section(self): doc = pydocstring.Docstring( summary="Brief.", diff --git a/src/syntax.rs b/src/syntax.rs index c43d67c..786a6dc 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -10,7 +10,7 @@ use core::fmt; use core::fmt::Write; -use crate::text::TextRange; +use crate::text::{LineColumn, LineIndex, TextRange}; // ============================================================================= // SyntaxKind @@ -426,12 +426,18 @@ impl SyntaxElement { pub struct Parsed { source: String, root: SyntaxNode, + line_index: LineIndex, } impl Parsed { /// Creates a new `Parsed` from source text and root node. pub fn new(source: String, root: SyntaxNode) -> Self { - Self { source, root } + let line_index = LineIndex::new(&source); + Self { + source, + root, + line_index, + } } /// The full source text. @@ -444,6 +450,13 @@ impl Parsed { &self.root } + /// Convert a byte offset to a [`LineColumn`] position. + /// + /// `lineno` is 1-based; `col` is the 0-based byte column within the line. + pub fn line_col(&self, offset: crate::text::TextSize) -> LineColumn { + self.line_index.line_col(offset) + } + /// Produce a Biome-style pretty-printed representation of the tree. 
pub fn pretty_print(&self) -> String { let mut out = String::new(); diff --git a/src/text.rs b/src/text.rs index 45683f5..47906f9 100644 --- a/src/text.rs +++ b/src/text.rs @@ -149,3 +149,67 @@ impl fmt::Display for TextRange { write!(f, "{}..{}", self.start, self.end) } } + +// ============================================================================= +// LineColumn +// ============================================================================= + +/// A line/column position in the source text. +/// +/// `lineno` is 1-based; `col` is the 0-based byte offset from the start of +/// the line. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct LineColumn { + /// 1-based line number. + pub lineno: u32, + /// 0-based byte column offset from the start of the line. + pub col: u32, +} + +impl fmt::Display for LineColumn { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{}", self.lineno, self.col) + } +} + +// ============================================================================= +// LineIndex +// ============================================================================= + +/// A lookup table for converting byte offsets to [`LineColumn`] positions. +/// +/// Build once from the source text with [`LineIndex::new`], then call +/// [`LineIndex::line_col`] for any [`TextSize`] offset. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct LineIndex { + /// Byte offset of the first character of each line. + /// `line_starts[0]` is always 0 (start of the first line). + line_starts: Vec<u32>, +} + +impl LineIndex { + /// Build a `LineIndex` from the source text. + pub fn new(source: &str) -> Self { + let mut line_starts = vec![0u32]; + for (i, b) in source.bytes().enumerate() { + if b == b'\n' { + line_starts.push((i + 1) as u32); + } + } + Self { line_starts } + } + + /// Convert a byte offset to a [`LineColumn`] position. + /// + /// `lineno` is 1-based; `col` is the 0-based byte offset within the line. 
+ pub fn line_col(&self, offset: TextSize) -> LineColumn { + let offset = offset.raw(); + // The index of the last line that starts at or before `offset`. + let line = self.line_starts.partition_point(|&s| s <= offset) - 1; + let col = offset - self.line_starts[line]; + LineColumn { + lineno: line as u32 + 1, + col, + } + } +} From 1961dbab98d0d838952552f77f73e80d0c7b0dab Mon Sep 17 00:00:00 2001 From: qraqras Date: Mon, 16 Mar 2026 01:52:30 +0000 Subject: [PATCH 3/3] docs: v0.1.2 --- CHANGELOG.md | 19 +++++++++++++++++++ Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 2 +- bindings/python/Cargo.lock | 4 ++-- bindings/python/Cargo.toml | 4 ++-- bindings/python/pyproject.toml | 2 +- 7 files changed, 27 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fca9cce..f19dc49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.2] - 2026-03-16 + +### Added + +- `LineColumn` struct (`lineno`, `col`) in `text.rs` for representing + line/column positions; `lineno` is 1-based, `col` is a 0-based byte offset + within the line. +- `LineIndex` in `text.rs` — a newline-offset lookup table built from source + text; converts any `TextSize` byte offset to `LineColumn` in O(log n). +- `Parsed::line_col(offset: TextSize) -> LineColumn` method for resolving + byte offsets in the syntax tree to line/column positions. +- Python bindings: `LineColumn` class with `lineno` and `col` properties. + `col` is expressed in **Unicode codepoints** (a Python `str` index), not + raw bytes; note that `ast`'s `col_offset` is a UTF-8 byte offset instead. +- Python bindings: `GoogleDocstring.line_col(offset)` and + `NumPyDocstring.line_col(offset)` methods; `offset` is typically obtained + from `Token.range.start` or `Token.range.end`. 
+ ## [0.1.1] - 2026-03-10 ### Added @@ -48,5 +66,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Zero external crate dependencies - Python bindings via PyO3 (`pydocstring-rs`) +[0.1.2]: https://github.com/qraqras/pydocstring/compare/v0.1.1...v0.1.2 [0.1.1]: https://github.com/qraqras/pydocstring/compare/v0.1.0...v0.1.1 [0.1.0]: https://github.com/qraqras/pydocstring/releases/tag/v0.1.0 diff --git a/Cargo.lock b/Cargo.lock index 0d4dbd2..376fa45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,4 +4,4 @@ version = 4 [[package]] name = "pydocstring" -version = "0.1.1" +version = "0.1.2" diff --git a/Cargo.toml b/Cargo.toml index b563d32..a08644b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pydocstring" -version = "0.1.1" +version = "0.1.2" edition = "2024" authors = ["Ryuma Asai"] description = "A zero-dependency Rust parser for Python docstrings (Google and NumPy styles) with a unified syntax tree and byte-precise source locations" diff --git a/README.md b/README.md index 35b8272..e2a0310 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Python bindings are also available as [`pydocstring-rs`](https://pypi.org/projec ```toml [dependencies] -pydocstring = "0.1.1" +pydocstring = "0.1.2" ``` ## Usage diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index ae1277b..25605ea 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -67,11 +67,11 @@ dependencies = [ [[package]] name = "pydocstring" -version = "0.1.1" +version = "0.1.2" [[package]] name = "pydocstring-python" -version = "0.1.1" +version = "0.1.2" dependencies = [ "pydocstring", "pyo3", diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index a19c15c..aa61ae6 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pydocstring-python" -version = "0.1.1" +version = "0.1.2" edition = "2024" authors = ["Ryuma Asai"] description = 
"Python bindings for pydocstring — a fast docstring parser for Google and NumPy styles" @@ -12,5 +12,5 @@ name = "pydocstring" crate-type = ["cdylib"] [dependencies] -pydocstring_core = { package = "pydocstring", version = "0.1.1", path = "../.." } +pydocstring_core = { package = "pydocstring", version = "0.1.2", path = "../.." } pyo3 = { version = "0.24", features = ["extension-module"] } diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 2675b22..8184332 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "pydocstring-rs" -version = "0.1.1" +version = "0.1.2" description = "Python bindings for pydocstring — a zero-dependency Rust parser for Python docstrings (Google and NumPy styles) with a unified syntax tree and byte-precise source locations" license = {text = "MIT"} authors = [{name = "Ryuma Asai"}]