Skip to content

Commit 73751ae

Browse files
committed
Implement SQLAlchemy
1 parent 25adb36 commit 73751ae

File tree

10 files changed

+625
-271
lines changed

10 files changed

+625
-271
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,11 @@ Exceptions will be written into `waybackup_error.log` (each run overwrites the f
326326
<br>
327327
<br>
328328

329+
## Future ideas (long term)
330+
331+
- More module functionality
332+
- Docker UI
333+
329334
## Contributing
330335

331336
I'm always happy to receive feature requests that improve the usability of this tool.

pyproject.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ packages = ["pywaybackup"]
77

88
[project]
99
name = "pywaybackup"
10-
version = "4.0.0"
10+
version = "4.1.0"
1111
description = "Query and download archive.org as simple as possible."
1212
authors = [
1313
{ name = "bitdruid", email = "bitdruid@outlook.com" }
@@ -16,8 +16,7 @@ license = { file = "LICENSE" }
1616
readme = "README.md"
1717
requires-python = ">=3.8"
1818
dependencies = [
19-
"pysqlite3-binary==0.5.4; sys_platform == 'linux'",
20-
"pysqlite-binary; sys_platform == 'win32'",
19+
"SQLAlchemy==2.0.43",
2120
"requests==2.32.3",
2221
"tqdm==4.67.1",
2322
"python-magic==0.4.27; sys_platform == 'linux'",

pywaybackup/PyWayBackup.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
import sys
1+
import multiprocessing
22
import os
3-
import time
43
import signal
5-
import multiprocessing
6-
from pywaybackup.helper import url_split, sanitize_filename
4+
import sys
5+
import time
76
from importlib.metadata import version
87

98
import pywaybackup.archive_save as archive_save
9+
from pywaybackup.archive_download import DownloadArchive
1010
from pywaybackup.db import Database as db
11-
from pywaybackup.Verbosity import Verbosity as vb
1211
from pywaybackup.Exception import Exception as ex
12+
from pywaybackup.files import CDXfile, CDXquery, CSVfile, File
13+
from pywaybackup.helper import sanitize_filename, url_split
1314
from pywaybackup.SnapshotCollection import SnapshotCollection
14-
from pywaybackup.archive_download import DownloadArchive
15-
from pywaybackup.files import CDXquery, CDXfile, CSVfile, File
15+
from pywaybackup.Verbosity import Verbosity as vb
1616

1717

1818
class _Status:
@@ -401,9 +401,9 @@ def paths(self, rel: bool = False) -> dict:
401401
"""
402402
files = {
403403
"snapshots": os.path.join(self._output, self._domain),
404-
"cdxfile": self._cdxfile,
404+
"cdxfile": self._cdxfile.filepath,
405405
"dbfile": self._dbfile,
406-
"csvfile": self._csvfile,
406+
"csvfile": self._csvfile.filepath,
407407
"log": self._log,
408408
"debug": self._debug,
409409
}
@@ -475,10 +475,10 @@ def stop(self) -> bool:
475475
return False
476476

477477
def _startup(self):
478-
if db.QUERY_EXIST:
478+
if db.query_exist:
479479
self._status.task = "resuming"
480480
vb.write(
481-
content=f"\nDOWNLOAD job exist - processed: {db.QUERY_PROGRESS}\nResuming download... (to reset the job use '--reset')"
481+
content=f"\nDOWNLOAD job exist - processed: {db.query_progress}\nResuming download... (to reset the job use '--reset')"
482482
)
483483

484484
if not self._silent:

pywaybackup/Snapshot.py

Lines changed: 114 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,35 @@
11
import os
2+
import threading
23

3-
from pywaybackup.db import Database
4+
from pywaybackup.db import Database, select, update, waybackup_snapshots
45
from pywaybackup.helper import url_split
56

67

78
class Snapshot:
89
"""
9-
If a relevant property of the snapshot is modified, the change will be pushed to the database.
10+
Represents a single snapshot entry and manages its state and persistence.
1011
11-
- _redirect_url
12-
- _redirect_timestamp
13-
- _response
14-
- _file
12+
When a relevant property of the snapshot is modified, the change is automatically
13+
pushed to the database:
14+
- redirect_url
15+
- redirect_timestamp
16+
- response_status
17+
- file
18+
19+
Thread-safe for SQLite operations using a lock.
1520
"""
1621

22+
__sqlite_lock = threading.Lock()
23+
1724
def __init__(self, db: Database, output: str, mode: str):
25+
"""
26+
Initialize a Snapshot instance and fetch its database row if available.
27+
28+
Args:
29+
db (Database): Database connection/session manager.
30+
output (str): Output directory for downloaded files.
31+
mode (str): Download mode ('first', 'last', or default).
32+
"""
1833
self._db = db
1934
self.output = output
2035
self.mode = mode
@@ -26,52 +41,78 @@ def __init__(self, db: Database, output: str, mode: str):
2641

2742
self._row = self.fetch()
2843
if self._row:
29-
self.counter = self._row["counter"]
30-
self.timestamp = self._row["timestamp"]
31-
self.url_archive = self._row["url_archive"]
32-
self.url_origin = self._row["url_origin"]
33-
self.redirect_url = self._row["redirect_url"]
34-
self.redirect_timestamp = self._row["redirect_timestamp"]
35-
self.response_status = self._row["response"]
36-
self.file = self._row["file"]
44+
self.scid = self._row.scid
45+
self.counter = self._row.counter
46+
self.timestamp = self._row.timestamp
47+
self.url_archive = self._row.url_archive
48+
self.url_origin = self._row.url_origin
49+
self.redirect_url = self._row.redirect_url
50+
self.redirect_timestamp = self._row.redirect_timestamp
51+
self.response_status = self._row.response
52+
self.file = self._row.file
3753
else:
3854
self.counter = False
3955

4056
def fetch(self):
4157
"""
42-
Get a snapshot-row from the snapshot table with response NULL. (not processed)
58+
Fetch a snapshot row from the database with response=NULL (not processed).
59+
Uses row locking to prevent concurrent workers from processing the same row.
60+
61+
Returns:
62+
waybackup_snapshots or None: The next unprocessed snapshot row, or None if none available.
4363
"""
4464
# mark as locked for other workers // only visual because get_snapshot fetches by NULL
45-
self._db.cursor.execute(
46-
"""
47-
UPDATE snapshot_tbl
48-
SET response = 'LOCK'
49-
WHERE rowid = (
50-
SELECT rowid FROM snapshot_tbl
51-
WHERE response IS NULL
52-
LIMIT 1
53-
)
54-
RETURNING rowid, *;
55-
"""
56-
)
57-
row = self._db.cursor.fetchone()
58-
self._db.conn.commit()
59-
return row
65+
# Prevent another worker from fetching the same row between the SELECT and the LOCK update (on SQLite this is guarded by a threading.Lock; on other dialects by the row-level lock).
66+
67+
def __on_sqlite():
68+
if self._db.session.bind.dialect.name == "sqlite":
69+
return True
70+
return False
71+
72+
def __get_row():
73+
with self._db.session.begin():
74+
row = self._db.session.execute(
75+
select(waybackup_snapshots)
76+
.where(waybackup_snapshots.response.is_(None))
77+
.order_by(waybackup_snapshots.scid)
78+
.limit(1)
79+
.with_for_update(skip_locked=True)
80+
).scalar_one_or_none()
81+
82+
if row is None:
83+
return None
84+
85+
row.response = "LOCK"
86+
87+
return row
88+
89+
if __on_sqlite():
90+
with self.__sqlite_lock:
91+
return __get_row()
92+
else:
93+
return __get_row()
6094

6195
def modify(self, column, value):
6296
"""
63-
Modify the snapshot in the database.
97+
Update a column value for this snapshot in the database.
98+
99+
Args:
100+
column (str): Name of the column to update.
101+
value: New value to set for the column.
64102
"""
65-
query = f"UPDATE snapshot_tbl SET {column} = ? WHERE counter = ?"
66-
self._db.cursor.execute(query, (value, self.counter))
67-
self._db.conn.commit()
103+
column = getattr(waybackup_snapshots, column)
104+
self._db.session.execute(update(waybackup_snapshots).where(waybackup_snapshots.scid == self.scid).values({column: value}))
105+
self._db.session.commit()
68106

69107
def create_output(self):
70108
"""
71-
Create a file path for the snapshot.
109+
Generate the file path for the snapshot download.
72110
73-
- If MODE_LAST or MODE_FIRST is enabled, the path does not include the timestamp.
74-
- Otherwise, include the timestamp in the path.
111+
If mode is 'first' or 'last', the path does not include the timestamp.
112+
Otherwise, the timestamp is included in the path.
113+
114+
Returns:
115+
str: Absolute path to the output file for the snapshot.
75116
"""
76117
domain, subdir, filename = url_split(self.url_archive.split("id_/")[1], index=True)
77118

@@ -86,43 +127,79 @@ def create_output(self):
86127

87128
@property
88129
def redirect_url(self):
130+
"""
131+
str: The redirect URL for this snapshot, if any.
132+
"""
89133
return self._redirect_url
90134

91135
@redirect_url.setter
92136
def redirect_url(self, value):
137+
"""
138+
Set the redirect URL and update the database.
139+
140+
Args:
141+
value (str): The new redirect URL.
142+
"""
93143
if self.redirect_timestamp is None and value is None:
94144
return
95145
self._redirect_url = value
96146
self.modify(column="redirect_url", value=value)
97147

98148
@property
99149
def redirect_timestamp(self):
150+
"""
151+
str: The timestamp of the redirect, if any.
152+
"""
100153
return self._redirect_timestamp
101154

102155
@redirect_timestamp.setter
103156
def redirect_timestamp(self, value):
157+
"""
158+
Set the redirect timestamp and update the database.
159+
160+
Args:
161+
value (str): The new redirect timestamp.
162+
"""
104163
if self.redirect_url is None and value is None:
105164
return
106165
self._redirect_timestamp = value
107166
self.modify(column="redirect_timestamp", value=value)
108167

109168
@property
110169
def response_status(self):
170+
"""
171+
str: The HTTP response/status for this snapshot.
172+
"""
111173
return self._response_status
112174

113175
@response_status.setter
114176
def response_status(self, value):
177+
"""
178+
Set the response status and update the database.
179+
180+
Args:
181+
value (str): The new response status.
182+
"""
115183
if self.response_status is None and value is None:
116184
return
117185
self._response_status = value
118186
self.modify(column="response", value=value)
119187

120188
@property
121189
def file(self):
190+
"""
191+
str: The file path for the downloaded snapshot.
192+
"""
122193
return self._file
123194

124195
@file.setter
125196
def file(self, value):
197+
"""
198+
Set the file path and update the database.
199+
200+
Args:
201+
value (str): The new file path.
202+
"""
126203
if self.file is None and value is None:
127204
return
128205
self._file = value

0 commit comments

Comments
 (0)