From 482110cea4d75098b1b9f28b5e8463b26da87596 Mon Sep 17 00:00:00 2001 From: Moothes Date: Mon, 9 Feb 2026 20:37:35 +0800 Subject: [PATCH 1/3] Add files via upload --- extract_images_and_pack2h5.py | 79 +++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/extract_images_and_pack2h5.py b/extract_images_and_pack2h5.py index 2b9182f..de61ebd 100755 --- a/extract_images_and_pack2h5.py +++ b/extract_images_and_pack2h5.py @@ -7,6 +7,7 @@ import argparse from Aslide import Slide from PIL import Image +import time COLOR_CORRECTION_FLAG = False @@ -46,6 +47,7 @@ def get_wsi_handle(wsi_path): def read_images(arg): + st = time.time() h5_path, save_path, wsi_path = arg if wsi_path is None: return @@ -65,10 +67,18 @@ def read_images(arg): level = h5['coords'].attrs['patch_level'] size = h5['coords'].attrs['patch_size'] + time_load_h5 = time.time() - st + st = time.time() + #print(f'Loaded h5 file: {h5_path} with {_num} patches, patch level: {level}, patch size: {size} using {time_load_h5:.2f}s') + wsi_handle = get_wsi_handle(wsi_path) total_number_of_patches = len(coors) allowed_corrupted = int(DROP_SLIDE_THRESHOLD * total_number_of_patches) corrupted_count = 0 + + time_load_wsi = time.time() - st + st = time.time() + #print(f'Loaded WSI file: {wsi_path} with {total_number_of_patches} patches, allowed corrupted: {allowed_corrupted} using {time_load_wsi:.2f}s') try: with h5py.File(save_path+'.temp', 'w') as h5_file: # create dataset for patches @@ -81,6 +91,10 @@ def read_images(arg): compression_opts=6 ) + time_create_h5 = time.time() - st + st = time.time() + #print(f'Created new h5 file: {save_path} using {time_create_h5:.2f}s') + # process each image and store as JPEG for i, (x, y) in enumerate(coors): # some tiles may be corrupted, if failed, use white image @@ -100,53 +114,64 @@ def read_images(arg): # store JPEG byte stream in dataset patches_dataset[i] = np.frombuffer(jpeg_bytes, dtype=np.uint8) + + #time_load_each_h5_data = time.time() - st + #st = time.time() + #print(f'Loaded h5 data and processed patches for {wsi_path} using {time_load_each_h5_data:.2f}s') + + time_load_h5_data = time.time() - st + st = time.time() + #print(f'Loaded h5 data and processed {len(coors)} patches for {wsi_path} using {time_load_h5_data:.2f}s') + os.rename(save_path+'.temp', save_path) - print(f"{wsi_path} finished!") + time_pack = time.time() - st + st = time.time() + print(f"{os.path.basename(wsi_path)} with {total_number_of_patches} patches finished using {time_load_h5:.2f}/{time_load_wsi:.2f}s to load h5/wsi, {time_create_h5:.2f}/{time_load_h5_data:.2f}s to create/load h5 data, {time_pack:.2f}s to save h5, total time: {time_load_h5 + time_load_wsi + time_create_h5 + time_load_h5_data + time_pack:.2f}s") + except Exception as e: print(f'{wsi_path} failed to process: {e}') os.remove(save_path+'.temp') def get_wsi_path(wsi_root, h5_files, wsi_format): + st = time.time() kv = {} # Convert wsi_format to list if it's not already - formats = [wsi_format] if isinstance(wsi_format, str) else wsi_format + formats = wsi_format.split(';') if isinstance(wsi_format, str) else wsi_format # auto search path all_paths = glob.glob(os.path.join(wsi_root, '**'), recursive=True) # Check for any of the formats all_paths = [i for i in all_paths if any(f'.{fmt}' in i for fmt in formats)] - for h in h5_files: - prefix = os.path.splitext(h)[0] - # Try each format until we find a match + # Create a dictionary mapping WSI filenames to their full paths for quick lookup + wsi_path_map = {os.path.basename(p): p for p in all_paths} + # os.path.basename will give the filename with extension, which should match the prefix of h5 files if they are named consistently + + wsi_paths = [] + matched_h5 = 0 + + for h5_file in h5_files: + prefix = os.path.splitext(h5_file)[0] + found_path = None + # Try each format until a match is found in the map for fmt in formats: - wsi_file_name = f'{prefix}.{fmt}' - p = [i for i in all_paths if wsi_file_name == os.path.split(i)[-1]] - if len(p) == 1: - kv[prefix] = os.path.split(p[0])[0] + wsi_filename = f'{prefix}.{fmt}' + if wsi_filename in wsi_path_map: + found_path = wsi_path_map[wsi_filename] break - else: # No break occurred, no match found + + if found_path: + wsi_paths.append(found_path) + matched_h5 += 1 + else: + wsi_paths.append(None) print('failed to process:', prefix) - kv[prefix] = None - wsi_paths = [] - for h in h5_files: - prefix = os.path.splitext(h)[0] - r = kv[prefix] - if r is None: - p = None - else: - # Find which format was actually matched - matched_format = None - for fmt in formats: - if os.path.exists(os.path.join(r, f'{prefix}.{fmt}')): - matched_format = fmt - break - p = os.path.join(r, f'{prefix}.{matched_format}') if matched_format else None - - wsi_paths.append(p) + all_h5 = len(h5_files) + failed_h5 = all_h5 - matched_h5 + print(f'Result of matching {all_h5} h5 patches with original data using {time.time()-st}s: {matched_h5} success and {failed_h5} fail.') return wsi_paths From 346b0f0e2d8e89d025b83d22a5d1caf9101ef18b Mon Sep 17 00:00:00 2001 From: Moothes Date: Mon, 9 Feb 2026 20:37:49 +0800 Subject: [PATCH 2/3] Add files via upload From 989919f3862fc3dd7ed7d550a166c7553ca9fa87 Mon Sep 17 00:00:00 2001 From: Moothes Date: Mon, 9 Feb 2026 20:42:46 +0800 Subject: [PATCH 3/3] Add files via upload --- extract_images_and_pack2h5.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/extract_images_and_pack2h5.py b/extract_images_and_pack2h5.py index de61ebd..0a7856d 100755 --- a/extract_images_and_pack2h5.py +++ b/extract_images_and_pack2h5.py @@ -69,7 +69,6 @@ def read_images(arg): time_load_h5 = time.time() - st st = time.time() - #print(f'Loaded h5 file: {h5_path} with {_num} patches, patch level: {level}, patch size: {size} using {time_load_h5:.2f}s') wsi_handle = get_wsi_handle(wsi_path) total_number_of_patches = len(coors) @@ -78,7 +77,6 @@ def read_images(arg): time_load_wsi = time.time() - st st = time.time() - #print(f'Loaded WSI file: {wsi_path} with {total_number_of_patches} patches, allowed corrupted: {allowed_corrupted} using {time_load_wsi:.2f}s') try: with h5py.File(save_path+'.temp', 'w') as h5_file: # create dataset for patches @@ -93,7 +91,6 @@ def read_images(arg): time_create_h5 = time.time() - st st = time.time() - #print(f'Created new h5 file: {save_path} using {time_create_h5:.2f}s') # process each image and store as JPEG for i, (x, y) in enumerate(coors): @@ -115,13 +112,13 @@ def read_images(arg): # store JPEG byte stream in dataset patches_dataset[i] = np.frombuffer(jpeg_bytes, dtype=np.uint8) + ### use the following code to measure time for processing each patch, but it will significantly increase the total time, so we comment it out for now #time_load_each_h5_data = time.time() - st #st = time.time() #print(f'Loaded h5 data and processed patches for {wsi_path} using {time_load_each_h5_data:.2f}s') time_load_h5_data = time.time() - st st = time.time() - #print(f'Loaded h5 data and processed {len(coors)} patches for {wsi_path} using {time_load_h5_data:.2f}s') os.rename(save_path+'.temp', save_path) time_pack = time.time() - st