Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 116 additions & 12 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import re
import sys
from typing import BinaryIO, Any
from io import BytesIO
from typing import Any, BinaryIO

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand All @@ -10,7 +13,7 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl # noqa: F401
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

Expand All @@ -32,6 +35,109 @@
]
# File extensions recognised as legacy ".xls" workbooks.
# NOTE(review): the consumer of this list is outside the visible hunk —
# presumably an XLS converter's accepts() check; confirm.
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]

# Matches a currency symbol inside an Excel number-format code, either quoted
# (e.g. "$"#,##0.00) or written directly in front of a digit placeholder
# (e.g. €#,##0.00, $0.00). Group 1 holds a quoted symbol, group 2 an unquoted
# one. Formats without a currency symbol (e.g. 0.00, #,##0) do not match.
CURRENCY_FORMAT_PATTERN = re.compile(
    r'["\']([$€£¥₹])["\']|([$€£¥₹])(?=[\d#])'
)


def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
    """
    Render a cell's value as text, honouring its Excel number format.

    Only ``cell.value`` and ``cell.number_format`` are read. The format code
    is interpreted here (rather than delegated to openpyxl) and only its
    first ``;``-separated section is used; negative numbers get a leading
    ``-``.

    Rules, in order:

    * ``None`` -> ``""``.
    * Booleans and non-numeric values -> ``str(value)``.
    * Formats without digit placeholders (``General``, ``@`` ...) ->
      ``str(value)``, so no precision is lost and no separators are invented.
    * Currency formats (``"$"#,##0.00``, ``$#,##0.00``, ``[$€-407]#,##0.00``)
      -> symbol followed by the number.
    * Percent formats -> value multiplied by 100 with a trailing ``%``.
    * Any other numeric format -> fixed-point number.

    The number of decimals comes from the placeholders after the decimal
    point (none means zero decimals). Thousands separators are emitted only
    when the format groups digits with a comma. Rounding is whatever
    Python's ``format`` produces.

    Args:
        cell: Object exposing ``value`` and ``number_format`` attributes.

    Returns:
        The formatted text for the cell.
    """
    value = cell.value
    if value is None:
        return ""

    # bool is a subclass of int; without this guard True would print as "1".
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return str(value)

    # A format may hold up to four ";"-separated sections; the first one
    # describes positive numbers and is the only one interpreted here.
    positive_section = (cell.number_format or "General").split(";", 1)[0]

    # Drop bracketed tokens ([Red], [$€-407]), quoted literals and
    # escape/padding/fill pairs (\x, _x, *x) so that their characters are
    # not mistaken for digit placeholders.
    placeholders = re.sub(r'\[[^\]]*\]|"[^"]*"|[_*\\].', "", positive_section)

    if "0" not in placeholders and "#" not in placeholders:
        # No explicit numeric layout: keep the value exactly as stored.
        return str(value)

    decimals_match = re.search(r"\.(0+|#+)", placeholders)
    decimals = len(decimals_match.group(1)) if decimals_match else 0
    # A comma between two placeholders means digit grouping; a trailing
    # comma (scaling) must not trigger separators.
    grouping = "," if re.search(r"[0#],[0#]", placeholders) else ""
    spec = f"{grouping}.{decimals}f"

    currency_match = re.search(
        r'\[\$([$€£¥₹])[^\]]*\]|["\']([$€£¥₹])["\']|([$€£¥₹])(?=[\d#])',
        positive_section,
    )
    if currency_match:
        symbol = next(filter(None, currency_match.groups()))
        # Put the sign in front of the symbol: "-$5.00", not "$-5.00".
        sign = "-" if value < 0 else ""
        return f"{sign}{symbol}{format(abs(value), spec)}"

    if "%" in placeholders:
        return f"{format(value * 100, spec)}%"

    return format(value, spec)


def _escape_markdown_cell(text: str) -> str:
    """Make *text* safe for a single Markdown table cell.

    Line breaks are replaced by spaces and ``|`` is backslash-escaped, since
    either one would otherwise split the cell or the row.
    """
    return " ".join(text.splitlines()).replace("|", "\\|")


def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
    """
    Convert an openpyxl worksheet to a Markdown table, preserving number formats.

    The first row becomes the table header and is rendered with ``str()``;
    every following row becomes a data row. Numeric cells in data rows are
    rendered through ``_format_cell_value`` so their number format is
    honoured; other values use ``str()`` and empty cells become empty
    strings. All cell text is escaped so that ``|`` and line breaks cannot
    break the table layout.

    Args:
        ws: Worksheet-like object whose ``iter_rows()`` yields rows of cells
            exposing a ``value`` attribute.

    Returns:
        The Markdown table, or ``""`` when the sheet has no rows.
    """
    # Walk the sheet once with cell objects; both the value and the number
    # format are needed, so a second values-only pass would be redundant.
    row_iter = iter(ws.iter_rows())
    header_cells = next(row_iter, None)
    if header_cells is None:
        return ""

    header = [
        _escape_markdown_cell("" if cell.value is None else str(cell.value))
        for cell in header_cells
    ]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]

    for row in row_iter:
        rendered = []
        for cell in row:
            value = cell.value
            if value is None:
                text = ""
            elif isinstance(value, (int, float)):
                # Numbers need the cell itself so the format can be applied.
                text = _format_cell_value(cell)
            else:
                text = str(value)
            rendered.append(_escape_markdown_cell(text))
        lines.append("| " + " | ".join(rendered) + " |")

    return "\n".join(lines)


class XlsxConverter(DocumentConverter):
"""
Expand Down Expand Up @@ -80,17 +186,15 @@ def convert(
_xlsx_dependency_exc_info[2]
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
# Read the Excel file using openpyxl to preserve number formats
file_stream.seek(0)
wb = openpyxl.load_workbook(file_stream, data_only=True)

md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
md_content += f"## {sheet_name}\n"
md_content += _convert_sheet_to_markdown(ws) + "\n\n"

return DocumentConverterResult(markdown=md_content.strip())

Expand Down