Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,35 +1,66 @@
name: codecov
name: CI

on:
pull_request:
branches-ignore:
- main
push:
branches:
- main

jobs:
run:
test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python-version: ['3.13', '3.12', '3.11', '3.10', '3.9']
python-version: ["3.13", "3.12", "3.11", "3.10", "3.9"]
env:
OS: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
node-version: 16
enable-cache: true

- name: Set up Python ${{ matrix.python-version }}
run: uv python install ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install -r requirements.txt --use-pep517
pip install -r requirements_test.txt --use-pep517
python setup.py sdist bdist_wheel
pip install dist/*.whl
run: uv sync --python ${{ matrix.python-version }}

- name: Run tests and collect coverage
run: python -m pytest tests/ --cov=./ --cov-report=xml
run: uv run pytest

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4

lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Ruff lint
run: uvx ruff check .

- name: Ruff format check
run: uvx ruff format --check .

typecheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Install ty
run: uv tool install ty

- name: Run ty
run: uv run ty check obsidiantools
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,4 @@ notebooks/
.gitignore
junit.xml
codecov.yml
.claude/settings.local.json
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,37 @@
It's incredibly easy to explore structured data on your vault through this fluent interface. This is all the code you need to generate a `vault` object that stores all the data:

```python
from pathlib import Path
import obsidiantools.api as otools

vault = otools.Vault(<VAULT_DIRECTORY>).connect().gather()
vault = otools.Vault(Path(<VAULT_DIRECTORY>)).connect().gather()
```

You can store your vault path in a `.env` file (ignored by git) so you don't have to retype it:

```
# .env
VAULT_DIR=C:\Users\me\My Vault
```

```python
import os
from pathlib import Path
from dotenv import load_dotenv
import obsidiantools.api as otools

load_dotenv()
vault = otools.Vault(Path(os.environ["VAULT_DIR"])).connect().gather()
```

To speed up processing on large vaults, enable parallel loading with the `workers` parameter:

```python
vault = otools.Vault(Path(os.environ["VAULT_DIR"]), workers=8).connect().gather()
```

The `workers` value set at initialisation is used by both `connect()` and `gather()`. You can override it per-call, e.g. `.connect(workers=4).gather(workers=2)`.

These are the basics of the method calls:
- `connect()`: connect your notes together in a graph structure and get metadata on links (e.g. wikilinks, backlinks, etc.). There is the option to include 'attachment' files in the graph.
- `gather()`: gather the plaintext content from your notes in one place. This includes the 'source text' that represents how your notes are written. There are arguments to control what text you want to remove, e.g. remove code.
Expand Down
30 changes: 25 additions & 5 deletions obsidiantools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
from . import api
from . import md_utils
from . import html_processing
from . import canvas_utils
from . import media_utils
import logging

from . import api as api
from . import canvas_utils as canvas_utils
from . import html_processing as html_processing
from . import md_utils as md_utils
from . import media_utils as media_utils


def enable_debug(level: int = logging.DEBUG) -> None:
    """Turn on console logging for the obsidiantools package.

    Sets the package logger to *level* and, on first use, attaches a
    stream handler with a timestamped format. Calling again only
    adjusts the level (no duplicate handlers are added).

    Usage:
        import obsidiantools
        obsidiantools.enable_debug()              # DEBUG level
        obsidiantools.enable_debug(logging.INFO)  # INFO level only
    """
    logger = logging.getLogger("obsidiantools")
    logger.setLevel(level)
    if logger.handlers:
        # A handler is already attached (from a previous call); avoid
        # emitting each record more than once.
        return
    stream_handler = logging.StreamHandler()
    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
    stream_handler.setFormatter(fmt)
    logger.addHandler(stream_handler)
83 changes: 62 additions & 21 deletions obsidiantools/_constants.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,77 @@
# WIKILINKS AND EMBEDDED FILES: regex that includes any aliases
# group 0 captures embedded link; group 1 is everything inside [[]]
WIKILINK_REGEX = r'(!)?\[{2}([^\]\]]+)\]{2}'
WIKILINK_REGEX = r"(!)?\[{2}([^\]\]]+)\]{2}"

# TAGS
TAG_INCLUDE_NESTED_REGEX = r'(?<!\()(?<!\\)#{1}([A-z]+[0-9_\-]*[A-Z0-9]?[^\s]+(?![^\[\[]*\]\]))\/?'
TAG_MAIN_ONLY_REGEX = r'(?<!\()#{1}([A-z]+[0-9_\-]*[A-Z0-9]?)\/?'
TAG_INCLUDE_NESTED_REGEX = (
r"(?<!\()(?<!\\)#{1}([A-z]+[0-9_\-]*[A-Z0-9]?[^\s]+(?![^\[\[]*\]\]))\/?"
)
TAG_MAIN_ONLY_REGEX = r"(?<!\()#{1}([A-z]+[0-9_\-]*[A-Z0-9]?)\/?"

# md links: catch URLs or paths
INLINE_LINK_AFTER_HTML_PROC_REGEX = r'\[([^\]]+)\]\(<([^)]+)>\)'
INLINE_LINK_VIA_MD_ONLY_REGEX = r'\[([^\]]+)\]\(([^)]+)\)'
INLINE_LINK_AFTER_HTML_PROC_REGEX = r"\[([^\]]+)\]\(<([^)]+)>\)"
INLINE_LINK_VIA_MD_ONLY_REGEX = r"\[([^\]]+)\]\(([^)]+)\)"

# helpers:
WIKILINK_AS_STRING_REGEX = r'\[[^\]]+\]\([^)]+\)'
EMBEDDED_FILE_LINK_AS_STRING_REGEX = r'!?\[{2}([^\]\]]+)\]{2}'
WIKILINK_AS_STRING_REGEX = r"\[[^\]]+\]\([^)]+\)"
EMBEDDED_FILE_LINK_AS_STRING_REGEX = r"!?\[{2}([^\]\]]+)\]{2}"

# Sets of extensions via https://help.obsidian.md/How+to/Embed+files :
# NB: file.ext and file.EXT can exist in same folder
IMG_EXT_SET = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg',
'.PNG', '.JPG', '.JPEG', '.GIF', '.BMP', '.SVG'}
AUDIO_EXT_SET = {'.mp3', '.webm', '.wav', '.m4a', '.ogg', '.3gp', '.flac',
'.MP3', '.WEBM', '.WAV', '.M4A', '.OGG', '.3GP', '.FLAC'}
VIDEO_EXT_SET = {'.mp4', '.webm', '.ogv', '.mov', '.mkv',
'.MP4', '.WEBM', '.OGV', '.MOV', '.MKV'}
PDF_EXT_SET = {'.pdf',
'.PDF'}
IMG_EXT_SET = {
".png",
".jpg",
".jpeg",
".gif",
".bmp",
".svg",
".PNG",
".JPG",
".JPEG",
".GIF",
".BMP",
".SVG",
}
AUDIO_EXT_SET = {
".mp3",
".webm",
".wav",
".m4a",
".ogg",
".3gp",
".flac",
".MP3",
".WEBM",
".WAV",
".M4A",
".OGG",
".3GP",
".FLAC",
}
VIDEO_EXT_SET = {
".mp4",
".webm",
".ogv",
".mov",
".mkv",
".MP4",
".WEBM",
".OGV",
".MOV",
".MKV",
}
PDF_EXT_SET = {".pdf", ".PDF"}
# canvas files:
CANVAS_EXT_SET = {'.canvas',
'.CANVAS'}
CANVAS_EXT_SET = {".canvas", ".CANVAS"}

# metadata df cols order:
METADATA_DF_COLS_GENERIC_TYPE = [
'rel_filepath', 'abs_filepath',
'file_exists',
'n_backlinks', 'n_wikilinks', 'n_tags', 'n_embedded_files',
'modified_time']
"rel_filepath",
"abs_filepath",
"file_exists",
"n_backlinks",
"n_wikilinks",
"n_tags",
"n_embedded_files",
"modified_time",
]
74 changes: 39 additions & 35 deletions obsidiantools/_io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path
from glob import glob
from pathlib import Path

import numpy as np


Expand All @@ -18,16 +19,20 @@ def get_relpaths_from_dir(dir_path: Path, *, extension: str) -> list[Path]:
Returns:
list of Path objects
"""
relpaths_list = [Path(p).relative_to(dir_path)
for p in glob(f"{dir_path}/**/*.{extension}",
recursive=True)]
relpaths_list = [
Path(p).relative_to(dir_path)
for p in glob(f"{dir_path}/**/*.{extension}", recursive=True)
]
return relpaths_list


def get_relpaths_matching_subdirs(dir_path: Path, *,
extension: str,
include_subdirs: list = None,
include_root: bool = True) -> list[Path]:
def get_relpaths_matching_subdirs(
dir_path: Path,
*,
extension: str,
include_subdirs: list = None,
include_root: bool = True,
) -> list[Path]:
"""Get list of relative paths for {extension} files in a given directory,
filtered to include specified subdirectories (with include_subdirs
kwarg). The default arguments align with get_relpaths_from_dir
Expand Down Expand Up @@ -61,34 +66,35 @@ def get_relpaths_matching_subdirs(dir_path: Path, *,
# forward slash consistently here.

if include_subdirs:
include_subdirs_final = [str(Path(i).as_posix())
for i in include_subdirs]
include_subdirs_final = [str(Path(i).as_posix()) for i in include_subdirs]

if not include_subdirs and include_root:
return get_relpaths_from_dir(dir_path,
extension=extension)
return get_relpaths_from_dir(dir_path, extension=extension)
elif not include_subdirs and not include_root:
return [i for i in get_relpaths_from_dir(dir_path,
extension=extension)
if str(i.parent.as_posix()) != '.']
return [
i
for i in get_relpaths_from_dir(dir_path, extension=extension)
if str(i.parent.as_posix()) != "."
]
else:
if include_root:
return [i for i in get_relpaths_from_dir(dir_path,
extension=extension)
if str(i.parent.as_posix())
in include_subdirs_final + ['.']]
return [
i
for i in get_relpaths_from_dir(dir_path, extension=extension)
if str(i.parent.as_posix()) in include_subdirs_final + ["."]
]
else:
return [i for i in get_relpaths_from_dir(dir_path,
extension=extension)
if str(i.parent.as_posix())
in include_subdirs_final]
return [
i
for i in get_relpaths_from_dir(dir_path, extension=extension)
if str(i.parent.as_posix()) in include_subdirs_final
]


def _get_valid_filepaths_by_ext_set(dirpath: Path, *,
exts: set[str]):
all_files = [p.relative_to(dirpath)
for p in Path(dirpath).glob("**/*")
if p.suffix in exts]
def _get_valid_filepaths_by_ext_set(dirpath: Path, *, exts: set[str]):
all_files = [
p.relative_to(dirpath) for p in Path(dirpath).glob("**/*") if p.suffix in exts
]
return all_files


Expand All @@ -98,15 +104,13 @@ def _get_shortest_path_by_filename(relpaths_list: list[Path]) -> dict[str, Path]

# get indices of dupe 'filename w/ ext':
_, inverse_ix, counts = np.unique(
np.array(all_file_names_list),
return_inverse=True,
return_counts=True,
axis=0)
np.array(all_file_names_list), return_inverse=True, return_counts=True, axis=0
)
dupe_names_ix = np.where(counts[inverse_ix] > 1)[0]

# get shortest paths via mask:
shortest_paths_arr = np.array(all_file_names_list, dtype=object)
shortest_paths_arr[dupe_names_ix] = np.array(
[str(fpath)
for fpath in relpaths_list])[dupe_names_ix]
return {fn: path for fn, path in zip(shortest_paths_arr, relpaths_list)}
[fpath.as_posix() for fpath in relpaths_list]
)[dupe_names_ix]
return dict(zip(shortest_paths_arr, relpaths_list))
Loading