Infiniloom Python bindings for RAG pipelines and vector database integration.
pip install infiniloom
import infiniloom
# Pack a repository into Claude-optimized XML
context = infiniloom.pack("/path/to/repo", format="xml", model="claude")
# Generate embedding chunks for vector databases
result = infiniloom.embed("/path/to/repo", max_tokens=1000)
for chunk in result["chunks"]:
print(f"{chunk['id']}: {chunk['source']['symbol']}")
# Scan repository statistics
stats = infiniloom.scan("/path/to/repo")
print(f"Files: {stats['total_files']}, Tokens: {stats['total_tokens']['claude']}")
# Count tokens in text
tokens = infiniloom.count_tokens("Hello, world!", model="claude")

Pack a repository into an LLM-optimized format.
def pack(
path: str,
format: str = "xml",
model: str = "claude",
compression: str = "balanced",
map_budget: int = 2000,
max_symbols: int = 50,
redact_secrets: bool = True,
skip_symbols: bool = False,
) -> str

Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| path | str | required | Path to the repository |
| format | str | "xml" | Output format: "xml", "markdown", "json", "yaml", "toon", "plain" |
| model | str | "claude" | Target LLM for token counting |
| compression | str | "balanced" | Compression level: "none", "minimal", "balanced", "aggressive", "extreme" |
| map_budget | int | 2000 | Token budget for repository map |
| max_symbols | int | 50 | Maximum symbols in map |
| redact_secrets | bool | True | Redact detected secrets |
| skip_symbols | bool | False | Skip symbol extraction (faster) |
Returns: str - Formatted repository context
Example:
# Claude-optimized XML
context = infiniloom.pack("/path/to/repo", format="xml", model="claude")
# GPT-4o Markdown with aggressive compression
context = infiniloom.pack("/path/to/repo", format="markdown", model="gpt4o", compression="aggressive")
# Token-efficient TOON format
context = infiniloom.pack("/path/to/repo", format="toon")

Scan a repository and return statistics.
def scan(
path: str,
include_hidden: bool = False,
respect_gitignore: bool = True,
exclude: Optional[List[str]] = None,
) -> ScanResult

Returns: ScanResult dictionary:
{
"name": str, # Repository name
"path": str, # Repository path
"total_files": int, # Total files
"total_lines": int, # Total lines of code
"total_tokens": { # Token counts by model
"o200k": int, # GPT-4o, GPT-5 (exact)
"cl100k": int, # GPT-4 (exact)
"claude": int, # Claude (estimated)
"gemini": int, # Gemini (estimated)
"llama": int, # Llama (estimated)
...
},
"languages": [ # Language breakdown
{"language": str, "files": int, "lines": int, "percentage": float}
],
"branch": Optional[str],
"commit": Optional[str],
}

Example:
stats = infiniloom.scan("/path/to/repo")
print(f"Repository: {stats['name']}")
print(f"Total files: {stats['total_files']}")
print(f"Claude tokens: {stats['total_tokens']['claude']}")
for lang in stats['languages']:
print(f" {lang['language']}: {lang['files']} files ({lang['percentage']:.1f}%)")Generate embedding chunks for vector databases.
def embed(
path: str,
max_tokens: int = 1000,
min_tokens: int = 50,
context_lines: int = 5,
include_imports: bool = True,
include_top_level: bool = True,
include_tests: bool = False,
security_scan: bool = True,
include_patterns: Optional[List[str]] = None,
exclude_patterns: Optional[List[str]] = None,
manifest_path: Optional[str] = None,
diff_only: bool = False,
) -> EmbedResult

Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| path | str | required | Path to repository |
| max_tokens | int | 1000 | Maximum tokens per chunk |
| min_tokens | int | 50 | Minimum tokens per chunk |
| context_lines | int | 5 | Context lines around symbols |
| include_imports | bool | True | Include import statements |
| include_top_level | bool | True | Include top-level code |
| include_tests | bool | False | Include test files |
| security_scan | bool | True | Enable secret scanning |
| include_patterns | List[str] | None | Glob patterns to include |
| exclude_patterns | List[str] | None | Glob patterns to exclude |
| manifest_path | str | None | Custom manifest path |
| diff_only | bool | False | Only return changed chunks |
Returns: EmbedResult dictionary:
{
"version": int,
"settings": EmbedSettings,
"chunks": [EmbedChunk, ...],
"summary": {
"total_chunks": int,
"total_tokens": int,
"added": Optional[int],
"modified": Optional[int],
"removed": Optional[int],
"unchanged": Optional[int],
},
"diff": Optional[EmbedDiff],
}

Chunk structure:
{
"id": "ec_a1b2c3d4...", # Content-addressable ID
"full_hash": "a1b2c3d4...", # Full BLAKE3 hash
"content": "fn foo() {...}", # Chunk content
"tokens": 150, # Token count
"kind": "function", # function, class, method, etc.
"source": {
"file": "src/main.rs",
"lines": [10, 25],
"symbol": "foo",
"fqn": "src::main::foo", # Fully qualified name
"language": "Rust",
"parent": Optional[str],
"visibility": "public",
"is_test": False,
},
"context": {
"docstring": Optional[str],
"signature": Optional[str],
"calls": ["bar", "baz"], # Functions this calls
"called_by": ["main"], # Functions that call this
"imports": ["std::io"],
"tags": ["async", "public-api"], # Auto-generated semantic tags
},
}

Example:
# Generate chunks for RAG pipeline
result = infiniloom.embed("/path/to/repo", max_tokens=1500)
for chunk in result["chunks"]:
# Embed and upsert to vector DB
embedding = get_embedding(chunk["content"])
vector_db.upsert(
id=chunk["id"],
vector=embedding,
metadata={
"file": chunk["source"]["file"],
"symbol": chunk["source"]["symbol"],
"kind": chunk["kind"],
"tags": chunk["context"]["tags"],
}
)
# Incremental updates
result = infiniloom.embed("/path/to/repo", diff_only=True)
print(f"Added: {result['summary']['added']}, Modified: {result['summary']['modified']}")Count tokens in text for a specific model.
def count_tokens(text: str, model: str = "claude") -> intExample:
tokens = infiniloom.count_tokens("Hello, world!", model="claude")
print(f"Tokens: {tokens}")
# Exact counting for OpenAI models
gpt4o_tokens = infiniloom.count_tokens(code, model="gpt4o")

Scan repository for security issues.

def scan_security(path: str) -> List[SecurityFinding]

Returns: List of security findings:
{
"file": str, # File path
"line": int, # Line number
"severity": str, # "Critical", "High", "Medium", "Low"
"kind": str, # Type of finding
"pattern": str, # Matched pattern
}

Example:
findings = infiniloom.scan_security("/path/to/repo")
for finding in findings:
if finding["severity"] == "Critical":
print(f"CRITICAL: {finding['kind']} in {finding['file']}:{finding['line']}")Compress text while preserving meaning.
def semantic_compress(
text: str,
similarity_threshold: float = 0.7,
budget_ratio: float = 0.5,
) -> str

Example:
long_text = "..." # Your long text
compressed = infiniloom.semantic_compress(long_text, budget_ratio=0.3)
print(f"Reduced from {len(long_text)} to {len(compressed)} chars")Build or update the symbol index for fast queries.
def build_index(
path: str,
force: bool = False,
include_tests: bool = False,
max_file_size: Optional[int] = None,
exclude: Optional[List[str]] = None,
incremental: bool = False,
) -> IndexStatus

Returns: IndexStatus dictionary:
{
"exists": bool,
"file_count": int,
"symbol_count": int,
"last_built": Optional[str], # ISO 8601
"version": Optional[str],
"files_updated": Optional[int],
"incremental": bool,
}

Example:
# Build index
status = infiniloom.build_index("/path/to/repo")
print(f"Indexed {status['symbol_count']} symbols")
# Incremental update
status = infiniloom.build_index("/path/to/repo", incremental=True)
print(f"Updated {status['files_updated']} files")Find a symbol by name.
def find_symbol(path: str, name: str) -> List[SymbolInfo]Example:
symbols = infiniloom.find_symbol("/path/to/repo", "authenticate")
for sym in symbols:
print(f"{sym['kind']}: {sym['name']} at {sym['file']}:{sym['line']}")Get all functions that call a symbol.
def get_callers(path: str, symbol_name: str) -> List[SymbolInfo]Example:
callers = infiniloom.get_callers("/path/to/repo", "validate_input")
print(f"validate_input is called by {len(callers)} functions")Get all functions that a symbol calls.
def get_callees(path: str, symbol_name: str) -> List[SymbolInfo]Get the complete call graph.
def get_call_graph(
path: str,
max_nodes: Optional[int] = None,
max_edges: Optional[int] = None,
) -> CallGraph

Returns: CallGraph dictionary:
{
"nodes": [SymbolInfo, ...],
"edges": [
{
"caller_id": int,
"callee_id": int,
"caller": str,
"callee": str,
"file": str,
"line": int,
}
],
"stats": {
"total_symbols": int,
"total_calls": int,
"functions": int,
"classes": int,
}
}

Get all functions that eventually call a symbol (up to max depth).
def get_transitive_callers(
path: str,
symbol_name: str,
max_depth: int = 3,
max_results: int = 100,
) -> List[TransitiveCaller]

Example:
callers = infiniloom.get_transitive_callers("/path/to/repo", "dangerous_function", max_depth=5)
for c in callers:
print(f"Depth {c['depth']}: {' -> '.join(c['call_path'])}")Detect circular import/dependency cycles in the codebase.
def find_circular_dependencies(path: str) -> List[DependencyCycle]Returns: List of DependencyCycle dictionaries:
{
"files": List[str], # File paths forming the cycle
"file_ids": List[int], # Internal file IDs
"length": int, # Number of files in the cycle
}

Example:
# Build index first
infiniloom.build_index("/path/to/repo")
# Find circular dependencies
cycles = infiniloom.find_circular_dependencies("/path/to/repo")
if cycles:
print(f"Found {len(cycles)} circular dependency cycles:")
for cycle in cycles:
print(f" Cycle of {cycle['length']} files: {' -> '.join(cycle['files'])}")
else:
print("No circular dependencies found")Get all public/exported symbols in the repository or a specific file.
def get_exported_symbols(
path: str,
file_path: Optional[str] = None,
) -> List[SymbolInfo]

Parameters:
| Parameter | Type | Description |
|---|---|---|
| path | str | Path to repository |
| file_path | str | Optional file path to filter symbols |
Returns: List of SymbolInfo dictionaries for public/exported symbols only.
Example:
# Build index first
infiniloom.build_index("/path/to/repo")
# Get all exported symbols
exports = infiniloom.get_exported_symbols("/path/to/repo")
print(f"Found {len(exports)} exported symbols")
for sym in exports:
print(f" {sym['visibility']} {sym['kind']}: {sym['name']} at {sym['file']}:{sym['line']}")
# Get exports from a specific file
file_exports = infiniloom.get_exported_symbols("/path/to/repo", file_path="src/lib.rs")
print(f"Exports from src/lib.rs: {', '.join(s['name'] for s in file_exports)}")Extract structured documentation from a docstring/comment.
def extract_documentation(raw_doc: str, language: str) -> DocumentationParameters:
| Parameter | Type | Description |
|---|---|---|
| raw_doc | str | Raw documentation string (JSDoc, Python docstring, etc.) |
| language | str | Language of the code ("javascript", "python", "rust", etc.) |
Returns: Documentation dictionary:
{
"summary": Optional[str],
"description": Optional[str],
"params": [
{
"name": str,
"type_info": Optional[str],
"description": Optional[str],
"is_optional": bool,
"default_value": Optional[str],
}
],
"returns": {
"type_info": Optional[str],
"description": Optional[str],
},
"throws": [{"exception_type": str, "description": Optional[str]}],
"examples": [{"code": str, "title": Optional[str], "language": Optional[str]}],
"is_deprecated": bool,
"deprecation_message": Optional[str],
"since": Optional[str],
"see_also": List[str],
"tags": Dict[str, List[str]],
"raw": Optional[str],
}

Example:
doc = """
@param name - The user's name
@param age - The user's age
@returns The greeting message
@throws ValueError If name is empty
"""
result = infiniloom.extract_documentation(doc, "javascript")
print(f"Params: {len(result['params'])}")
for param in result['params']:
print(f" {param['name']}: {param['description']}")Calculate complexity metrics for source code.
def calculate_complexity(source: str, language: str) -> ComplexityMetricsReturns: ComplexityMetrics dictionary:
{
"cyclomatic": int, # Cyclomatic complexity
"cognitive": int, # Cognitive complexity
"halstead": {
"distinct_operators": int,
"distinct_operands": int,
"total_operators": int,
"total_operands": int,
"vocabulary": int,
"length": int,
"calculated_length": float,
"volume": float,
"difficulty": float,
"effort": float,
"time": float,
"bugs": float,
},
"loc": {
"total": int,
"source": int,
"comments": int,
"blank": int,
},
"maintainability_index": Optional[float],
"max_nesting_depth": int,
"parameter_count": int,
"return_count": int,
}

Example:
code = """
def complex_function(a, b, c):
if a > 0:
if b > 0:
return a + b
else:
return a - b
return c
"""
metrics = infiniloom.calculate_complexity(code, "python")
print(f"Cyclomatic: {metrics['cyclomatic']}")
print(f"Cognitive: {metrics['cognitive']}")
print(f"Maintainability: {metrics['maintainability_index']:.1f}")Check code complexity against thresholds.
def check_complexity(
source: str,
language: str,
max_cyclomatic: int = 10,
max_cognitive: int = 15,
max_nesting: int = 4,
max_params: int = 5,
min_maintainability: float = 40.0,
) -> List[ComplexityIssue]

Returns: List of complexity issues:
{
"message": str, # Description of the issue
"severity": str, # "warning" or "error"
}

Example:
issues = infiniloom.check_complexity(code, "python", max_cyclomatic=5)
for issue in issues:
print(f"[{issue['severity']}] {issue['message']}")Detect dead code in a repository.
def detect_dead_code(
path: str,
languages: Optional[List[str]] = None,
) -> DeadCodeInfo

Returns: DeadCodeInfo dictionary:
{
"unused_exports": [
{
"name": str,
"kind": str,
"file_path": str,
"line": int,
"confidence": float,
"reason": str,
}
],
"unreachable_code": [
{
"file_path": str,
"start_line": int,
"end_line": int,
"reason": str,
"snippet": str,
}
],
"unused_private": [
{"name": str, "kind": str, "file_path": str, "line": int}
],
"unused_imports": [
{"name": str, "import_path": str, "file_path": str, "line": int}
],
"unused_variables": [
{"name": str, "file_path": str, "line": int, "scope": str}
],
}

Example:
result = infiniloom.detect_dead_code("/path/to/repo")
print(f"Unused exports: {len(result['unused_exports'])}")
print(f"Unreachable code: {len(result['unreachable_code'])}")
for export in result['unused_exports']:
print(f" {export['name']} in {export['file_path']}:{export['line']}")Detect breaking changes between two versions.
def detect_breaking_changes(
path: str,
old_ref: str,
new_ref: str,
) -> BreakingChangeReport

Returns: BreakingChangeReport dictionary:
{
"changes": [
{
"symbol_name": str,
"change_type": str, # "removed", "signature_changed", "visibility_changed"
"severity": str, # "breaking", "warning", "info"
"file_path": str,
"old_line": Optional[int],
"new_line": Optional[int],
"old_value": Optional[str],
"new_value": Optional[str],
"description": str,
"suggestion": Optional[str],
}
],
"summary": {
"total_changes": int,
"by_severity": {"breaking": int, "warning": int, "info": int},
"by_type": {"removed": int, "signature_changed": int, ...},
},
"old_ref": str,
"new_ref": str,
}

Example:
report = infiniloom.detect_breaking_changes("/path/to/repo", "v1.0.0", "v2.0.0")
print(f"Breaking changes: {report['summary']['by_severity'].get('breaking', 0)}")
for change in report['changes']:
if change['severity'] == 'breaking':
print(f" BREAKING: {change['symbol_name']} - {change['description']}")Get the type hierarchy for a class/interface.
def get_type_hierarchy(path: str, symbol_name: str) -> TypeHierarchyReturns: TypeHierarchy dictionary:
{
"name": str,
"kind": str,
"file": str,
"line": int,
"ancestors": [
{
"name": str,
"file": str,
"line": int,
"kind": str,
"depth": int,
"is_direct": bool,
}
],
"descendants": [...],
"interfaces": List[str],
}

Example:
hierarchy = infiniloom.get_type_hierarchy("/path/to/repo", "BaseController")
print(f"Ancestors: {[a['name'] for a in hierarchy['ancestors']]}")
print(f"Descendants: {[d['name'] for d in hierarchy['descendants']]}")Get all ancestors (parent classes/interfaces) of a type.
def get_type_ancestors(path: str, symbol_name: str) -> List[AncestorInfo]Get all descendants (child classes) of a type.
def get_type_descendants(path: str, symbol_name: str) -> List[AncestorInfo]Get all classes that implement an interface.
def get_implementors(path: str, interface_name: str) -> List[SymbolInfo]Example:
implementors = infiniloom.get_implementors("/path/to/repo", "Serializable")
print(f"Classes implementing Serializable: {len(implementors)}")
for impl in implementors:
print(f" {impl['name']} at {impl['file']}:{impl['line']}")Get context-aware diff with surrounding symbols.
def get_diff_context(
path: str,
from_ref: str = "",
to_ref: str = "HEAD",
depth: int = 2,
budget: int = 50000,
include_diff: bool = False,
model: Optional[str] = None,
exclude: Optional[List[str]] = None,
include: Optional[List[str]] = None,
) -> DiffContext

Example:
# Get context for staged changes
context = infiniloom.get_diff_context("/path/to/repo", from_ref="", to_ref="HEAD", include_diff=True)
print(f"Changed files: {len(context['changed_files'])}")
print(f"Related symbols: {len(context['context_symbols'])}")
print(f"Related tests: {len(context['related_tests'])}")Analyze the impact of changes to files.
def analyze_impact(
path: str,
files: List[str],
depth: int = 2,
include_tests: bool = False,
model: Optional[str] = None,
) -> ImpactResult

Example:
impact = infiniloom.analyze_impact("/path/to/repo", ["src/auth.py"])
print(f"Impact level: {impact['impact_level']}")
print(f"Dependent files: {impact['dependent_files']}")
print(f"Test files to run: {impact['test_files']}")Object-oriented interface for repository operations.
class Infiniloom:
def __init__(self, path: str) -> None
def load(self, include_hidden: bool = False, respect_gitignore: bool = True) -> None
def stats(self) -> ScanResult
def pack(self, format: str = "xml", model: str = "claude", ...) -> str
def map(self, map_budget: int = 2000, max_symbols: int = 50) -> RepoMap
def scan_security(self) -> List[SecurityFinding]
def files(self) -> List[FileInfo]

Example:
from infiniloom import Infiniloom
loom = Infiniloom("/path/to/repo")
stats = loom.stats()
print(f"Repository: {stats['name']}, Files: {stats['total_files']}")
context = loom.pack(format="xml", model="claude")
findings = loom.scan_security()

Git repository operations.
class GitRepo:
def __init__(self, path: str) -> None
def current_branch(self) -> str
def current_commit(self) -> str
def status(self) -> List[ChangedFileInfo]
def diff_files(self, from_ref: str, to_ref: str) -> List[ChangedFileInfo]
def log(self, count: int = 10) -> List[Commit]
def file_log(self, path: str, count: int = 10) -> List[Commit]
def blame(self, path: str) -> List[BlameInfo]
def ls_files(self) -> List[str]
def diff_hunks(self, from_ref: str, to_ref: str, path: Optional[str] = None) -> List[DiffHunk]

Example:
from infiniloom import GitRepo, is_git_repo
if is_git_repo("/path/to/repo"):
repo = GitRepo("/path/to/repo")
print(f"Branch: {repo.current_branch()}")
print(f"Commit: {repo.current_commit()}")
for commit in repo.log(count=5):
print(f"{commit['short_hash']}: {commit['message']}")All core functions have async versions:
import asyncio
import infiniloom
async def main():
context = await infiniloom.pack_async("/path/to/repo")
stats = await infiniloom.scan_async("/path/to/repo")
result = await infiniloom.embed_async("/path/to/repo")
asyncio.run(main())

Available async functions:

- pack_async()
- scan_async()
- count_tokens_async()
- scan_security_async()
- semantic_compress_async()
- build_index_async()
- chunk_async()
- analyze_impact_async()
- get_diff_context_async()
- find_symbol_async()
- get_callers_async()
- get_callees_async()
- get_references_async()
- get_call_graph_async()
Base exception for all Infiniloom errors.
from infiniloom import InfiniloomError
try:
context = infiniloom.pack("/nonexistent/path")
except InfiniloomError as e:
print(f"Error: {e}")| Model | Accuracy | Description |
|---|---|---|
claude |
~95% prose, ~85% code | Anthropic Claude (default) |
gpt52, gpt51, gpt5 |
Exact | OpenAI GPT-5 series |
o4-mini, o3, o1 |
Exact | OpenAI reasoning models |
gpt4o, gpt4o-mini |
Exact | OpenAI GPT-4o |
gpt4, gpt35-turbo |
Exact | OpenAI legacy |
gemini |
~95% prose, ~85% code | Google Gemini |
llama, codellama |
~95% prose, ~85% code | Meta Llama |
mistral |
~95% prose, ~85% code | Mistral AI |
deepseek |
~95% prose, ~85% code | DeepSeek |
qwen |
~95% prose, ~85% code | Alibaba Qwen |
cohere |
~95% prose, ~85% code | Cohere |
grok |
~95% prose, ~85% code | xAI Grok |
import json
from pinecone import Pinecone
from openai import OpenAI
import infiniloom
# Initialize
pc = Pinecone(api_key="...")
index = pc.Index("code-embeddings")
openai = OpenAI()
# Generate chunks
result = infiniloom.embed("./my-repo", max_tokens=1500)
# Embed and upsert
for chunk in result["chunks"]:
response = openai.embeddings.create(
model="text-embedding-3-small",
input=chunk["content"]
)
index.upsert(vectors=[{
"id": chunk["id"],
"values": response.data[0].embedding,
"metadata": {
"file": chunk["source"]["file"],
"symbol": chunk["source"]["symbol"],
"language": chunk["source"]["language"],
"kind": chunk["kind"],
"tags": chunk["context"]["tags"],
}
}])

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import infiniloom
# Generate chunks
result = infiniloom.embed("./my-repo", max_tokens=1000)
# Convert to LangChain documents
from langchain.schema import Document
docs = [
Document(
page_content=chunk["content"],
metadata={
"id": chunk["id"],
"file": chunk["source"]["file"],
"symbol": chunk["source"]["symbol"],
}
)
for chunk in result["chunks"]
]
# Create vector store
vectorstore = Chroma.from_documents(docs, embedding_function)