From 0b27f28d0ae93f96822938e16fca08b9ff23975b Mon Sep 17 00:00:00 2001
From: OpenClaw AI
Date: Wed, 11 Mar 2026 23:40:04 +0800
Subject: [PATCH] Fix: Extract currency symbols and number formats from Excel cells
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add helper function _format_cell_value() to preserve currency symbols
- Support for USD ($), EUR (€), GBP (£), JPY (¥), and other currencies
- Support for percentage formatting
- Preserve decimal places from number format
- Use openpyxl directly instead of pandas for better format control

Fixes #53
---
 .../markitdown/converters/_xlsx_converter.py  | 105 +++++++++++++++---
 1 file changed, 93 insertions(+), 12 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..5b89185ec 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,5 +1,8 @@
+import re
 import sys
-from typing import BinaryIO, Any
+from io import BytesIO
+from typing import Any, BinaryIO
+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -10,7 +13,7 @@
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -32,6 +35,86 @@
 ]
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
+# Currency symbol inside an Excel number format: locale-tagged ([$€-407]),
+# quoted ("$") or bare ($#,##0.00). Exactly one capture group is populated.
+CURRENCY_FORMAT_PATTERN = re.compile(
+    r'\[\$([$€£¥₹])[^\]]*\]|["\']([$€£¥₹])["\']|(?<!\[)([$€£¥₹])'
+)
+
+# Fixed decimal places requested by a number format (".00" -> 2 places).
+_DECIMAL_PLACES_PATTERN = re.compile(r"\.(0+|#+)")
+
+
+def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
+    """
+    Render a cell as text, honouring its Excel number format.
+
+    Currency symbol, percent sign, thousands separator and decimal places
+    are derived from ``cell.number_format``. Empty cells yield "";
+    booleans and non-numeric values are returned through ``str()``.
+    """
+    value = cell.value
+    if value is None:
+        return ""
+    # bool is a subclass of int: keep TRUE/FALSE from being rendered as 1/0.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return str(value)
+
+    # Formats may hold "positive;negative;zero;text" sections; the first one
+    # is enough to recover the symbol, grouping and precision.
+    number_format = (cell.number_format or "General").split(";", 1)[0]
+    grouping = "," if "," in number_format else ""
+    decimal_match = _DECIMAL_PLACES_PATTERN.search(number_format)
+    decimals = len(decimal_match.group(1)) if decimal_match else None
+    places = decimals or 0
+
+    currency_match = CURRENCY_FORMAT_PATTERN.search(number_format)
+    if currency_match:
+        symbol = next(group for group in currency_match.groups() if group)
+        # Put the sign before the symbol: "-$5.00", not "$-5.00".
+        sign = "-" if value < 0 else ""
+        return f"{sign}{symbol}{abs(value):{grouping}.{places}f}"
+
+    if "%" in number_format:
+        # Percent cells store the fraction (0.5 is displayed as 50%).
+        return f"{value * 100:{grouping}.{places}f}%"
+
+    if decimals is not None:
+        return f"{value:{grouping}.{decimals}f}"
+
+    # No explicit precision (e.g. "General"): leave the stored value intact
+    # so years and IDs do not become "2,024" and floats are not rounded.
+    return f"{value:,}" if grouping else str(value)
+
+
+def _escape_table_cell(text: str) -> str:
+    """Escape characters that would break a Markdown table row."""
+    return text.replace("|", "\\|").replace("\r\n", "<br>").replace("\n", "<br>")
+
+
+def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
+    """
+    Convert an openpyxl worksheet to a Markdown table.
+
+    The first row is the header; data cells are rendered according to their
+    number format. Returns "" when the worksheet has no cells.
+    """
+    rows = list(ws.iter_rows())
+    if not rows:
+        return ""
+
+    header, *data_rows = rows
+    table = [
+        ["" if cell.value is None else str(cell.value) for cell in header],
+        ["---"] * len(header),
+    ]
+    table.extend([_format_cell_value(cell) for cell in row] for row in data_rows)
+
+    return "\n".join(
+        "| " + " | ".join(_escape_table_cell(text) for text in row) + " |"
+        for row in table
+    )
+
 
 class XlsxConverter(DocumentConverter):
     """
@@ -80,16 +163,14 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Read the Excel file using openpyxl to preserve number formats
+        file_stream.seek(0)
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            md_content += f"## {sheet_name}\n"
+            md_content += _convert_sheet_to_markdown(ws) + "\n\n"
 
         return DocumentConverterResult(markdown=md_content.strip())