diff --git a/AHK_inputs/Arrow1.PNG b/AHK_inputs/Arrow1.PNG
new file mode 100644
index 0000000..667b26f
Binary files /dev/null and b/AHK_inputs/Arrow1.PNG differ
diff --git a/AHK_inputs/Discover1.PNG b/AHK_inputs/Discover1.PNG
new file mode 100644
index 0000000..b819eaa
Binary files /dev/null and b/AHK_inputs/Discover1.PNG differ
diff --git a/AHK_inputs/EndPage1.png b/AHK_inputs/EndPage1.png
new file mode 100644
index 0000000..3f1b657
Binary files /dev/null and b/AHK_inputs/EndPage1.png differ
diff --git a/Arts.ahk b/Arts.ahk
new file mode 100644
index 0000000..f5cbefd
--- /dev/null
+++ b/Arts.ahk
@@ -0,0 +1,118 @@
+
+; Loops through artist pages and copies links
+; Requires the "Link grabber" browser extension
+; Requires an open Excel file to paste the links into
+; Written for a 1080p monitor - other resolutions will require changing the coordinates below
+; Starting point - https://artsandculture.google.com/category/artist?tab=az&date=1920&pr=A
+
+SetWorkingDir %A_ScriptDir%
+
+Esc::ExitApp
+
+^j::
+Loop, 1000
+{
+    ; Search for the end-of-page marker; if found, move on to the next letter of the alphabet
+    ImageSearch, FoundX, FoundY, 1880, 995, 1935, 1040, C:\Users\XXXX\AHK_inputs\EndPage1.PNG
+    if (ErrorLevel = 2)
+        MsgBox Could not conduct the search.
+    else if (ErrorLevel <> 1)
+    {
+        FoundYY = 1
+        Send {Home}
+        sleep, 1000
+        ; Advance to the next alphabetical letter
+        MouseMove, 1075, 378
+        Mouseclick
+        sleep, 2500
+        Send {Down 4}
+    }
+
+    MouseMove, 360, 442, 60
+    Send, ^{LButton}
+    sleep, 600
+    Loop, 4
+    {
+        MouseMove, 300, 0, 10, R
+        Send, ^{LButton}
+        sleep, 900
+    }
+    Gosub, Label1
+    sleep, 600
+    Send {Down 6}
+    sleep, 1500
+}
+return
+
+Label1:
+Loop, 5
+{
+    send, ^{tab}
+    CoordMode Pixel ; Interpret the coordinates below as relative to the screen rather than the active window.
+    FoundYY = 0
+    while FoundYY = 0
+    {
+        ImageSearch, FoundX, FoundY, 0, 0, A_ScreenWidth, A_ScreenHeight, C:\Users\XXXX\AHK_inputs\Discover1.PNG
+        if (ErrorLevel = 2)
+            MsgBox Could not conduct the search.
+        else if (ErrorLevel = 1)
+            Send {Down 6}
+        else if (FoundY > 500)
+            Send {Down 6}
+        else
+        {
+            ;MsgBox The icon was found at %FoundX%x%FoundY%.
+            FoundYY = 1
+            MouseMove, FoundX, FoundY
+            ; The following *must* be changed to the coordinates of the link grabber extension
+            MouseMove, 1600, 442, 10, R
+        }
+        sleep, 500
+    }
+    Arrow = 1
+    MouseMove, -100, -100, 0, R
+    MouseGetPos, xpos, ypos
+    MouseMove, 100, 100, 0, R
+    xpos2 := xpos+300
+    ypos2 := ypos+300
+    ImageSearch, FoundX2, FoundY2, %xpos%, %ypos%, %xpos2%, %ypos2%, C:\Users\XXXX\AHK_inputs\Arrow1.PNG
+    if (ErrorLevel = 1 or ErrorLevel = 2)
+        Arrow = 0
+    else
+        Arrow = 1
+    while Arrow = 1
+    {
+        Mouseclick
+        MouseMove, -100, -100, 0, R
+        MouseGetPos, xpos, ypos
+        xpos2 := xpos+300
+        ypos2 := ypos+300
+        sleep, 1000
+        ImageSearch, FoundX2, FoundY2, %xpos%, %ypos%, %xpos2%, %ypos2%, C:\Users\XXXX\AHK_inputs\Arrow1.PNG
+        if (ErrorLevel = 1 or ErrorLevel = 2)
+        {
+            ;msgbox Done!
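+            ; The next-page arrow is no longer on screen, so stop clicking through this item's related works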
+            Arrow = 0
+        }
+        else
+            Arrow = 1
+        MouseMove, 100, 100, 0, R
+    }
+    MouseMove, 1580, 70
+    Mouseclick
+    sleep, 1000
+    MouseMove, 1800, 200
+    Mouseclick
+    sleep, 300
+    WinActivate, ahk_class XLMAIN
+    sleep, 800
+    Clip0 = %ClipBoardAll%
+    ClipBoard = %ClipBoard% ; Convert the clipboard contents to plain text
+    SendInput ^v^{Down}{Down}
+    sleep, 2000
+    WinActivate, ahk_class Chrome_WidgetWin_1
+    SendInput ^w
+    SendInput ^w
+}
+Return
diff --git a/Create Artist Folders.bat b/Create Artist Folders.bat
new file mode 100644
index 0000000..73b1143
--- /dev/null
+++ b/Create Artist Folders.bat
@@ -0,0 +1,16 @@
+@echo off
+setlocal EnableExtensions DisableDelayedExpansion
+set "SourceDir=C:\Users\XXXX"
+set "DestDir=C:\Users\XXXX\Artists3"
+
+rem For every "Artist - Title.jpg" in SourceDir, create a folder named after the artist and move the file into it
+for /F "eol=| delims=" %%A in ('dir /B /A-D-H "%SourceDir%\*-*.jpg" 2^>nul') do (
+    set "string=%%~A"
+    setlocal enabledelayedexpansion
+    set "end=!string:* - =!"
+    for /F "delims=" %%G in ("!end!") do set "begin=!string: - %%~G=!"
+    md "%DestDir%\!begin!" 2>nul
+    move /Y "%SourceDir%\%%A" "%DestDir%\!begin!\"
+    endlocal
+)
+
+endlocal
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 145d4d3..96e6ca2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,9 @@
 pycryptodome
-lxml
-Pillow
-aiohttp
\ No newline at end of file
+lxml~=4.5.1
+Pillow~=7.1.2
+aiohttp~=3.6.2
+pyexiv2~=2.3.0
+cssselect
+unidecode
+pandas
+goto
\ No newline at end of file
diff --git a/tile_fetch.py b/tile_fetch.py
index 1ddcfec..74702e5 100755
--- a/tile_fetch.py
+++ b/tile_fetch.py
@@ -8,13 +8,20 @@ import re
 import shutil
 import string
+import unidecode
 import urllib.parse
 import urllib.request
+import pandas as pd
+from random import randint
+from time import sleep
+
 from pathlib import Path
 
 import aiohttp
+import html as html_1
 from PIL import Image
-from lxml import etree
+from lxml import etree, html
+from pyexiv2 import Image as TaggedImage
 
 import async_tile_fetcher
from decryption import decrypt
@@ -35,13 +42,47 @@ def compute_url(path, token, x, y, z):
     url_bytes = b'https://lh3.googleusercontent.com/%s=x%d-y%d-z%d-t%s' % (path, x, y, z, signature)
     return url_bytes.decode('utf-8')
 
+
+def remove(value, deletechars):
+    """Strip every character in deletechars from value (used to sanitize filenames)."""
+    for c in deletechars:
+        value = value.replace(c, '')
+    return value
+
+
 class ImageInfo(object):
-    RE_URL_PATH_TOKEN = re.compile(rb']\r?\n,"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE)
+    RE_URL_PATH_TOKEN = re.compile(rb'],"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE)
 
     def __init__(self, url):
         page_source = urllib.request.urlopen(url).read()
-
+        ## Find the author and title outside the metadata block - a cleaner way to title the image output
+        page_source_html = html.fromstring(page_source)
+        page_source_bytes = etree.tostring(page_source_html)
+        page_source_str = page_source_bytes.decode('utf-8')
+        artist_search_str = "categoryId=artist\">"
+        title_search_str = "UEmoBd(FQhpwf);R7KM6d:r1oohb;\" data-title=\""
+        title_search_end = "\" data-galabel=\"asset-viewer"
+        page_source_str_artist = page_source_str.split(artist_search_str, 9)
+        page_source_str_title = page_source_str.split(title_search_str, 9)
+        self.page_source_artist = ""
+        try:
+            page_source_artist = unidecode.unidecode(html_1.unescape(page_source_str_artist[1].split("<", 1)[0]))
+            self.page_source_artist = remove(page_source_artist, '\/:*?"<>|')
+            self.source_artist_flag = 1
+        except IndexError:
+            self.source_artist_flag = 0
+        page_source_title = unidecode.unidecode(html_1.unescape(page_source_str_title[1].split(title_search_end, 1)[0]))
+        self.page_source_title = remove(page_source_title, '\/:*?"<>|')
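+        # Truncate defensively: Windows caps a single filename at 255 characters,
+        # and the artist, date, and image id still get appended to the title later on.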
+        self.page_source_title = self.page_source_title[:230]
+        while self.page_source_title[-1:] == "." or self.page_source_title[-1:] == " ":
+            self.page_source_title = self.page_source_title.rstrip(".")
+            self.page_source_title = self.page_source_title.rstrip(" ")
+
+        self.metadata = {'Xmp.xmp.URL': url}
+        for item in html.fromstring(page_source).cssselect('[id^="metadata"] li'):
+            text = item.text_content()
+            # XMP metadata needs to live under the Xmp.xmp section.
+            # Remove any non-word characters from the key, as they are invalid in metadata tag names.
+            key = 'Xmp.xmp.' + re.sub(r'\W', '', text[:text.find(':')])
+            self.metadata[key] = text[text.find(':') + 1:].strip()
+
         match = self.RE_URL_PATH_TOKEN.search(page_source)
         if match is None:
             raise ValueError("Unable to find google arts image token")
@@ -51,7 +92,16 @@ def __init__(self, url):
         self.token = token or b''
         url_path = urllib.parse.unquote_plus(urllib.parse.urlparse(url).path)
         self.image_slug, image_id = url_path.split('/')[-2:]
-        self.image_name = '%s - %s' % (string.capwords(self.image_slug.replace("-"," ")), image_id)
+        self.image_name = self.page_source_title
+        self.image_id = image_id
 
         meta_info_url = "https:{}=g".format(url_no_proto.decode('utf8'))
         meta_info_tree = etree.fromstring(urllib.request.urlopen(meta_info_url).read())
@@ -117,9 +173,13 @@ async def load_tiles(info, z=-1, outfile=None, quality=90):
     z %= len(info.tile_info)  # keep 0 <= z < len(tile_info)
     level = info.tile_info[z]
 
+    # Flag zoom levels that exceed JPEG's 65535-pixel side limit; currently informational only
+    PNG_Output = 0
+    if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535:
+        PNG_Output = 1
+
     img = Image.new(mode="RGB", size=level.size)
-
-    tiles_dir = Path(info.image_name)
+    tiles_dir = Path(info.page_source_title)
+
     tiles_dir.mkdir(exist_ok=True)
 
     async with aiohttp.ClientSession() as session:
@@ -131,18 +191,72 @@ async def load_tiles(info, z=-1, outfile=None, quality=90):
         ]
         print("Downloading tiles...")
         tiles = await async_tile_fetcher.gather_progress(awaitable_tiles)
 
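+    # Each tile arrives encrypted; decrypt it and paste it into the full-size canvas at its (x, y) grid offset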
Saving...") - final_image_filename = outfile or (info.image_name + '.jpg') - img.save(final_image_filename, quality=quality, subsampling=0) + + ## Try to extract author name ("Creator"/"Painter") and date ("Date Created"/"Date") from metadata + author = "0" + date = "" + for key, value in info.metadata.items(): + if info.source_artist_flag == 0: + if key.lower() == "xmp.xmp.creator" or key.lower() == "xmp.xmp.painter" or key.lower() == "xmp.xmp.illustrator": + # Avoiding non-ASCII characters in the painter/creator name. This runs if artist name could not be found from the page source + author = unidecode.unidecode(value) + author = author.replace("?","") + author = author.replace("\/","-") + author = author.replace("/","-") + author = author.replace("|","-") + author = author.replace("\\","-") + author = author.replace(":","-") + author = author.replace('"','') + author = author.replace('[','(') + author = author.replace(']',')') + author = author.replace("\n"," ") + author = author.replace("*"," ") + author = author.replace('<','') + info.page_source_artist = author[:30] + if key.lower() == "xmp.xmp.date" or key.lower() == "xmp.xmp.datecreated": + # Avoiding "/" in the date (year), especially when multiple dates are given. Not deprecating with author as date is often missing from the page source + date = unidecode.unidecode(value) + date = date.replace('/','-') + date = date.replace('?','') + date = date.replace('\\','') + date = date.replace("\n"," ") + date = date.replace(':','') + date = date.replace('[','(') + date = date.replace(']',')') + date = date.replace(';',',') + date = date.replace('"','') + date = date[:25] + + if(info.page_source_artist) == "": + info.page_source_artist = "No Author" + + title_max_length = 245 - len(info.page_source_artist + ' - ' + date + ' - ' + ' - ' + info.image_id)+3 + alt_image_filename = (info.page_source_artist + ' - ' + date + ' - ' + info.page_source_title[:(title_max_length)] + ' - ' + info.image_id + '.jpg') + img.save(alt_image_filename, quality=90, subsampling=0, optimize=True) + print("Adding metadata...") + xmp_file_obj = TaggedImage(alt_image_filename) + try: + xmp_file_obj.modify_xmp(info.metadata) + except: + print("Cannot write all metadata at once; writing tag by tag...") + # writes key:value one at a time, which is heavier on writes, + # but far more robust. + for key, value in info.metadata.items(): + try: + xmp_file_obj.modify_xmp({key: value}) + except RuntimeError as e: + print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') + print(repr(e)) shutil.rmtree(tiles_dir) - print("Saved the result as " + final_image_filename) - + print("Saved the result as " + alt_image_filename) + def main(): import argparse @@ -154,29 +268,134 @@ def main(): parser.add_argument('--outfile', type=str, nargs='?', help='The name of the file to create.') parser.add_argument('--quality', type=int, nargs='?', default=90, - help='Compression level from 0-95. Higher is better.') + help='Compression level from 0-95. 
+    parser.add_argument('-a', '--add_url', type=str, nargs='?', help='Add a new URL to the queue.',
+                        action='store', dest='add_url')
+    parser.add_argument('-b', '--batch-add', type=str, nargs=1, help="Adds a list of URLs to the queue from a csv file.",
+                        action="store", dest='csv')
+    parser.add_argument('-d', '--download', help="Downloads all remaining links in the queue.",
+                        action="store_true", default=None)
 
     args = parser.parse_args()
 
+    assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95"
 
-    url = args.url or input("Enter the url of the image: ")
-
-    print("Downloading image meta-information...")
-    image_info = ImageInfo(url)
-
-    zoom = args.zoom
-    if zoom is None:
-        print(image_info)
-        while True:
-            try:
-                zoom = int(input("Which level do you want to download? "))
-                assert 0 <= zoom < len(image_info.tile_info)
-                break
-            except (ValueError, AssertionError):
-                print("Not a valid zoom level.")
-
-    coro = load_tiles(image_info, zoom, args.outfile, args.quality)
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(coro)
+    if args.csv or args.add_url or args.download:
+        try:
+            df = pd.read_csv("dlcache", index_col=0)
+        except FileNotFoundError:
+            print("No cache found. Setting up a new one.")
+            df = pd.DataFrame(columns=['url', 'quality', 'downloaded'])
+
+        if args.csv:
+            url_df = pd.read_csv(args.csv[0])
+            for u in url_df['url']:
+                print("######### Processing '{}'".format(u))
+                # The trailing path segment of the asset URL serves as a unique image id
+                img_id = u[u.rfind("/") + 1:]
+                if img_id not in df.index:
+                    df.loc[img_id] = {'url': u, 'quality': args.quality, "downloaded": False}
+                    print("######### Added to queue.")
+                else:
+                    print("Image already in list. Ignoring the URL.")
+
+        if args.add_url:
+            print("######### Processing '{}'".format(args.add_url))
+            u = args.add_url
+            img_id = u[u.rfind("/") + 1:]
+            if img_id not in df.index:
+                df.loc[img_id] = {'url': args.add_url, 'quality': args.quality, "downloaded": False}
+                print("######### Added to queue.")
+            else:
+                print("Image already in list. Ignoring the URL.")
+
+        if args.download:
+            print("######### Starting download")
+            for row in df.loc[df['downloaded'] == False].iterrows():
+                print(row[1]['url'])
+                try:
+                    img_info = ImageInfo(row[1]['url'])
+
+                    if args.zoom:
+                        zoom = args.zoom
+                        if not 0 <= zoom < len(img_info.tile_info):
+                            print('No valid zoom level.')
+                    else:
+                        zoom = len(img_info.tile_info) - 1
+                        print("Defaulting to highest zoom level ({}).".format(zoom))
+
+                    ## Ensure the image resolution fits within the JPEG limits
+                    while img_info.tile_info[zoom].size[0] > 65535 or img_info.tile_info[zoom].size[1] > 65535:
+                        print('Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format(
+                            r=zoom, next_zoom=zoom - 1))
+                        zoom = zoom - 1
+                    print("Using zoom level {}.".format(zoom))
+
+                    # Retry the download up to three times before giving up on this image
+                    for attempt in range(3):
+                        try:
+                            coro = load_tiles(img_info, zoom, img_info.image_name, row[1]['quality'])
+                            loop = asyncio.get_event_loop()
+                            loop.run_until_complete(coro)
+                            break
+                        except Exception:
+                            if attempt == 2:
+                                raise
+                except Exception:
+                    pass
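+                # Mark the row as done and persist the cache after every image, so an interrupted batch can resume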
")) - assert 0 <= zoom < len(image_info.tile_info) - break - except (ValueError, AssertionError): - print("Not a valid zoom level.") - - coro = load_tiles(image_info, zoom, args.outfile, args.quality) - loop = asyncio.get_event_loop() - loop.run_until_complete(coro) + df.at[img_info.image_id, 'downloaded'] = True + df.to_csv('dlcache') + except: + print("Archive recording not successful") + print("Download successful. Sleeping before next download...") + sleep(randint(15,20)) + print("######### Finished download") + df.to_csv('dlcache') + + if args.csv is None and args.add_url is None and args.download is None: + url = args.url or input("Enter the url of the image: ") + + print("Downloading image meta-information...") + image_info = ImageInfo(url) + + zoom = args.zoom + if zoom is None: + print(image_info) + while True: + try: + zoom = int(input("Which level do you want to download? Choose 11 to default to largest JPEG-compliant level: ")) + if zoom == 11: + ## Ensuring image resolution fits in JPEG. Otherwise, image will be saved as PNG, which does not have max resolution limits (but does not allow for metadata embedding). + zoom = len(img_info.tile_info)-1 + while image_info.tile_info[zoom].size[0] > 65535 or image_info.tile_info[zoom].size[1] > 65535: + print( + 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( + r=zoom, + next_zoom=zoom-1) + ) + zoom = zoom-1 + else: + assert 0 <= zoom < len(image_info.tile_info) + break + except (ValueError, AssertionError): + print("Not a valid zoom level.") + + coro = load_tiles(image_info, zoom, args.outfile, args.quality) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) if __name__ == '__main__':