Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,35 +1,66 @@
name: codecov
name: CI

on:
pull_request:
branches-ignore:
- main
push:
branches:
- main

jobs:
run:
test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python-version: ['3.13', '3.12', '3.11', '3.10', '3.9']
python-version: ["3.13", "3.12", "3.11", "3.10", "3.9"]
env:
OS: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
node-version: 16
enable-cache: true

- name: Set up Python ${{ matrix.python-version }}
run: uv python install ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install -r requirements.txt --use-pep517
pip install -r requirements_test.txt --use-pep517
python setup.py sdist bdist_wheel
pip install dist/*.whl
run: uv sync --python ${{ matrix.python-version }}

- name: Run tests and collect coverage
run: python -m pytest tests/ --cov=./ --cov-report=xml
run: uv run pytest

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4

lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Ruff lint
run: uvx ruff check .

- name: Ruff format check
run: uvx ruff format --check .

typecheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Install ty
run: uv tool install ty

- name: Run ty
run: uv run ty check obsidiantools
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,4 @@ notebooks/
.gitignore
junit.xml
codecov.yml
.claude/settings.local.json
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,37 @@
It's incredibly easy to explore structured data on your vault through this fluent interface. This is all the code you need to generate a `vault` object that stores all the data:

```python
from pathlib import Path
import obsidiantools.api as otools

vault = otools.Vault(<VAULT_DIRECTORY>).connect().gather()
vault = otools.Vault(Path(<VAULT_DIRECTORY>)).connect().gather()
```

You can store your vault path in a `.env` file (ignored by git) so you don't have to retype it:

```
# .env
VAULT_DIR=C:\Users\me\My Vault
```

```python
import os
from pathlib import Path
from dotenv import load_dotenv
import obsidiantools.api as otools

load_dotenv()
vault = otools.Vault(Path(os.environ["VAULT_DIR"])).connect().gather()
```

To speed up processing on large vaults, enable parallel loading with the `workers` parameter:

```python
vault = otools.Vault(Path(os.environ["VAULT_DIR"]), workers=8).connect().gather()
```

The `workers` value set at initialisation is used by both `connect()` and `gather()`. You can override it per-call, e.g. `.connect(workers=4).gather(workers=2)`.

These are the basics of the method calls:
- `connect()`: connect your notes together in a graph structure and get metadata on links (e.g. wikilinks, backlinks, etc.). There is the option to include 'attachment' files in the graph.
- `gather()`: gather the plaintext content from your notes in one place. This includes the 'source text' that represents how your notes are written. There are arguments to control what text you want to remove, e.g. remove code.
Expand Down
30 changes: 25 additions & 5 deletions obsidiantools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
from . import api
from . import md_utils
from . import html_processing
from . import canvas_utils
from . import media_utils
import logging

from . import api as api
from . import canvas_utils as canvas_utils
from . import html_processing as html_processing
from . import md_utils as md_utils
from . import media_utils as media_utils


def enable_debug(level: int = logging.DEBUG) -> None:
    """Turn on console logging for the obsidiantools package.

    Sets the package logger to *level* and, on first use, attaches a
    stream handler with a timestamped format. Calling again only
    adjusts the level (no duplicate handlers are added).

    Usage:
        import obsidiantools
        obsidiantools.enable_debug()              # DEBUG level
        obsidiantools.enable_debug(logging.INFO)  # INFO level only
    """
    logger = logging.getLogger("obsidiantools")
    logger.setLevel(level)
    if logger.handlers:
        # A handler is already attached (from a previous call); avoid
        # emitting each record more than once.
        return
    stream_handler = logging.StreamHandler()
    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
    stream_handler.setFormatter(fmt)
    logger.addHandler(stream_handler)
83 changes: 62 additions & 21 deletions obsidiantools/_constants.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,77 @@
# WIKILINKS AND EMBEDDED FILES: regex that includes any aliases
# group 0 captures embedded link; group 1 is everything inside [[]]
WIKILINK_REGEX = r'(!)?\[{2}([^\]\]]+)\]{2}'
WIKILINK_REGEX = r"(!)?\[{2}([^\]\]]+)\]{2}"

# TAGS
TAG_INCLUDE_NESTED_REGEX = r'(?<!\()(?<!\\)#{1}([A-z]+[0-9_\-]*[A-Z0-9]?[^\s]+(?![^\[\[]*\]\]))\/?'
TAG_MAIN_ONLY_REGEX = r'(?<!\()#{1}([A-z]+[0-9_\-]*[A-Z0-9]?)\/?'
TAG_INCLUDE_NESTED_REGEX = (
r"(?<!\()(?<!\\)#{1}([A-z]+[0-9_\-]*[A-Z0-9]?[^\s]+(?![^\[\[]*\]\]))\/?"
)
TAG_MAIN_ONLY_REGEX = r"(?<!\()#{1}([A-z]+[0-9_\-]*[A-Z0-9]?)\/?"

# md links: catch URLs or paths
INLINE_LINK_AFTER_HTML_PROC_REGEX = r'\[([^\]]+)\]\(<([^)]+)>\)'
INLINE_LINK_VIA_MD_ONLY_REGEX = r'\[([^\]]+)\]\(([^)]+)\)'
INLINE_LINK_AFTER_HTML_PROC_REGEX = r"\[([^\]]+)\]\(<([^)]+)>\)"
INLINE_LINK_VIA_MD_ONLY_REGEX = r"\[([^\]]+)\]\(([^)]+)\)"

# helpers:
WIKILINK_AS_STRING_REGEX = r'\[[^\]]+\]\([^)]+\)'
EMBEDDED_FILE_LINK_AS_STRING_REGEX = r'!?\[{2}([^\]\]]+)\]{2}'
WIKILINK_AS_STRING_REGEX = r"\[[^\]]+\]\([^)]+\)"
EMBEDDED_FILE_LINK_AS_STRING_REGEX = r"!?\[{2}([^\]\]]+)\]{2}"

# Sets of extensions via https://help.obsidian.md/How+to/Embed+files :
# NB: file.ext and file.EXT can exist in same folder
IMG_EXT_SET = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg',
'.PNG', '.JPG', '.JPEG', '.GIF', '.BMP', '.SVG'}
AUDIO_EXT_SET = {'.mp3', '.webm', '.wav', '.m4a', '.ogg', '.3gp', '.flac',
'.MP3', '.WEBM', '.WAV', '.M4A', '.OGG', '.3GP', '.FLAC'}
VIDEO_EXT_SET = {'.mp4', '.webm', '.ogv', '.mov', '.mkv',
'.MP4', '.WEBM', '.OGV', '.MOV', '.MKV'}
PDF_EXT_SET = {'.pdf',
'.PDF'}
IMG_EXT_SET = {
".png",
".jpg",
".jpeg",
".gif",
".bmp",
".svg",
".PNG",
".JPG",
".JPEG",
".GIF",
".BMP",
".SVG",
}
AUDIO_EXT_SET = {
".mp3",
".webm",
".wav",
".m4a",
".ogg",
".3gp",
".flac",
".MP3",
".WEBM",
".WAV",
".M4A",
".OGG",
".3GP",
".FLAC",
}
VIDEO_EXT_SET = {
".mp4",
".webm",
".ogv",
".mov",
".mkv",
".MP4",
".WEBM",
".OGV",
".MOV",
".MKV",
}
PDF_EXT_SET = {".pdf", ".PDF"}
# canvas files:
CANVAS_EXT_SET = {'.canvas',
'.CANVAS'}
CANVAS_EXT_SET = {".canvas", ".CANVAS"}

# metadata df cols order:
METADATA_DF_COLS_GENERIC_TYPE = [
'rel_filepath', 'abs_filepath',
'file_exists',
'n_backlinks', 'n_wikilinks', 'n_tags', 'n_embedded_files',
'modified_time']
"rel_filepath",
"abs_filepath",
"file_exists",
"n_backlinks",
"n_wikilinks",
"n_tags",
"n_embedded_files",
"modified_time",
]
74 changes: 39 additions & 35 deletions obsidiantools/_io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path
from glob import glob
from pathlib import Path

import numpy as np


Expand All @@ -18,16 +19,20 @@ def get_relpaths_from_dir(dir_path: Path, *, extension: str) -> list[Path]:
Returns:
list of Path objects
"""
relpaths_list = [Path(p).relative_to(dir_path)
for p in glob(f"{dir_path}/**/*.{extension}",
recursive=True)]
relpaths_list = [
Path(p).relative_to(dir_path)
for p in glob(f"{dir_path}/**/*.{extension}", recursive=True)
]
return relpaths_list


def get_relpaths_matching_subdirs(dir_path: Path, *,
extension: str,
include_subdirs: list = None,
include_root: bool = True) -> list[Path]:
def get_relpaths_matching_subdirs(
dir_path: Path,
*,
extension: str,
include_subdirs: list = None,
include_root: bool = True,
) -> list[Path]:
"""Get list of relative paths for {extension} files in a given directory,
filtered to include specified subdirectories (with include_subdirs
kwarg). The default arguments align with get_relpaths_from_dir
Expand Down Expand Up @@ -61,34 +66,35 @@ def get_relpaths_matching_subdirs(dir_path: Path, *,
# forward slash consistently here.

if include_subdirs:
include_subdirs_final = [str(Path(i).as_posix())
for i in include_subdirs]
include_subdirs_final = [str(Path(i).as_posix()) for i in include_subdirs]

if not include_subdirs and include_root:
return get_relpaths_from_dir(dir_path,
extension=extension)
return get_relpaths_from_dir(dir_path, extension=extension)
elif not include_subdirs and not include_root:
return [i for i in get_relpaths_from_dir(dir_path,
extension=extension)
if str(i.parent.as_posix()) != '.']
return [
i
for i in get_relpaths_from_dir(dir_path, extension=extension)
if str(i.parent.as_posix()) != "."
]
else:
if include_root:
return [i for i in get_relpaths_from_dir(dir_path,
extension=extension)
if str(i.parent.as_posix())
in include_subdirs_final + ['.']]
return [
i
for i in get_relpaths_from_dir(dir_path, extension=extension)
if str(i.parent.as_posix()) in include_subdirs_final + ["."]
]
else:
return [i for i in get_relpaths_from_dir(dir_path,
extension=extension)
if str(i.parent.as_posix())
in include_subdirs_final]
return [
i
for i in get_relpaths_from_dir(dir_path, extension=extension)
if str(i.parent.as_posix()) in include_subdirs_final
]


def _get_valid_filepaths_by_ext_set(dirpath: Path, *,
exts: set[str]):
all_files = [p.relative_to(dirpath)
for p in Path(dirpath).glob("**/*")
if p.suffix in exts]
def _get_valid_filepaths_by_ext_set(dirpath: Path, *, exts: set[str]):
all_files = [
p.relative_to(dirpath) for p in Path(dirpath).glob("**/*") if p.suffix in exts
]
return all_files


Expand All @@ -98,15 +104,13 @@ def _get_shortest_path_by_filename(relpaths_list: list[Path]) -> dict[str, Path]

# get indices of dupe 'filename w/ ext':
_, inverse_ix, counts = np.unique(
np.array(all_file_names_list),
return_inverse=True,
return_counts=True,
axis=0)
np.array(all_file_names_list), return_inverse=True, return_counts=True, axis=0
)
dupe_names_ix = np.where(counts[inverse_ix] > 1)[0]

# get shortest paths via mask:
shortest_paths_arr = np.array(all_file_names_list, dtype=object)
shortest_paths_arr[dupe_names_ix] = np.array(
[str(fpath)
for fpath in relpaths_list])[dupe_names_ix]
return {fn: path for fn, path in zip(shortest_paths_arr, relpaths_list)}
[fpath.as_posix() for fpath in relpaths_list]
)[dupe_names_ix]
return dict(zip(shortest_paths_arr, relpaths_list))
Loading