From 21cc696d02228ce9b843f187d94a04e6d0c0c421 Mon Sep 17 00:00:00 2001 From: aycz Date: Wed, 19 Mar 2025 13:38:50 -0500 Subject: [PATCH 1/2] Implements all desired functionality. No tests yet. --- Exercises/Exercise-3/.gitignore | 2 ++ Exercises/Exercise-3/constants.py | 5 +++ Exercises/Exercise-3/docker-compose.yml | 6 ++-- Exercises/Exercise-3/main.py | 41 +++++++++++++++++++++++-- Exercises/Exercise-3/requirements.txt | 2 +- Exercises/Exercise-3/thoughts.txt | 41 +++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 Exercises/Exercise-3/.gitignore create mode 100644 Exercises/Exercise-3/constants.py create mode 100644 Exercises/Exercise-3/thoughts.txt diff --git a/Exercises/Exercise-3/.gitignore b/Exercises/Exercise-3/.gitignore new file mode 100644 index 00000000..169ecfdf --- /dev/null +++ b/Exercises/Exercise-3/.gitignore @@ -0,0 +1,2 @@ +# local environment files +*/.virtual/* diff --git a/Exercises/Exercise-3/constants.py b/Exercises/Exercise-3/constants.py new file mode 100644 index 00000000..c3198096 --- /dev/null +++ b/Exercises/Exercise-3/constants.py @@ -0,0 +1,5 @@ +''' CONSTANTS file for assigned S3 resource values. 
''' + +BUCKET = 'commoncrawl' +COMMON_CRAWL_KEY = 'crawl-data/CC-MAIN-2022-05/wet.paths.gz' +NEW_FILE_NAME = 'wet.paths.gz' diff --git a/Exercises/Exercise-3/docker-compose.yml b/Exercises/Exercise-3/docker-compose.yml index c87f658b..ac057645 100644 --- a/Exercises/Exercise-3/docker-compose.yml +++ b/Exercises/Exercise-3/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.9" services: test: image: "exercise-3" @@ -6,7 +5,10 @@ services: - .:/app command: python3 -m pytest run: + environment: + - AWS_PROFILE=default image: "exercise-3" volumes: + - ~/.aws:/root/.aws:ro - .:/app - command: python3 main.py \ No newline at end of file + command: python3 main.py diff --git a/Exercises/Exercise-3/main.py b/Exercises/Exercise-3/main.py index f71c4e19..da7ebb72 100644 --- a/Exercises/Exercise-3/main.py +++ b/Exercises/Exercise-3/main.py @@ -1,9 +1,46 @@ +import gzip +import io + import boto3 +from constants import BUCKET, COMMON_CRAWL_KEY + + +def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): + ''' docstring. 
''' + ret_line = '' + data_file = io.BytesIO() + print('*'*75) + print(f'bucket: {bucket}, key: {key}') + print('*'*75) + client.download_fileobj(bucket, key, data_file) + data_file.seek(0) + + with gzip.open(filename=data_file, mode='rt', encoding='utf-8') as curr_file: + while num_lines: + content = curr_file.readline() + if content: + num_lines -= 1 + ret_line = content + if print_out: + print(content) + + data_file.close() + + return ret_line + def main(): - # your code here - pass + ''' docstring''' + global COMMON_CRAWL_KEY + + session = boto3.session.Session() + esssthree = session.client('s3') + + new_key = read_from_s3(esssthree, BUCKET, COMMON_CRAWL_KEY, num_lines=1) + _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, print_out=True) + + return if __name__ == "__main__": diff --git a/Exercises/Exercise-3/requirements.txt b/Exercises/Exercise-3/requirements.txt index 1f43333c..30ddf823 100644 --- a/Exercises/Exercise-3/requirements.txt +++ b/Exercises/Exercise-3/requirements.txt @@ -1 +1 @@ -boto3==1.21.2 \ No newline at end of file +boto3 diff --git a/Exercises/Exercise-3/thoughts.txt b/Exercises/Exercise-3/thoughts.txt new file mode 100644 index 00000000..58099cdb --- /dev/null +++ b/Exercises/Exercise-3/thoughts.txt @@ -0,0 +1,41 @@ +def main(): + ''' docstring + TODO: + check if bucket exists with client error try/except block. + stream the file through a text buffer to get the first line. + get that url and gen new stream for said file, and print out. + ''' + +def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): + ''' + todo: + context wrap a file obj, and pass it into boto3 s3 client method + ``s3.download_fileobj('amzn-s3-demo-bucket', 'mykey', fileobj) + gzip buffer stream in `rt` mode, and readline(), first. + + with statement context manager did not close the file after finishing the decompressing + context. left lingering file, and is written to disk... 
+ used alternative of opening a binary buffer and writing, then finally closing. + ''' + # make line amount as a param. + ret_line = '' + # with open('test_file', 'wb+') as data_file: + +wt: 77+45+45+25+10+20+45+160+20 + +- bucket common crawl +- path: crawl-data/CC-MAIN-2022-05/wet.paths.gz + WET stands for "WARC Encapsulated Text" + WET format is quite simple: the WARC metadata contains various details, + including the URL and the length of the plaintext data, + with the plaintext data following immediately afterwards. +- AWS services (e.g. EMR) support the s3:// protocol, + and you may directly specify your input as s3://commoncrawl/path_to_file +- region: us-east-1 + +EC: + - no disk write -> context wrap the file being read in... + - can stream using a generator and next method + use tarfile module + use tarfile.open(mode='r:gz') as tf: + tf.extractfile(member) -> Extract a member from the archive as a file object. From ada384072e6c0f8d9d2c954925c0983d2d05bfe7 Mon Sep 17 00:00:00 2001 From: aycz Date: Thu, 4 Dec 2025 15:29:16 -0600 Subject: [PATCH 2/2] Adds refactors for better file object handling and console outputs. --- Exercises/Exercise-3/main.py | 63 ++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/Exercises/Exercise-3/main.py b/Exercises/Exercise-3/main.py index da7ebb72..1d15b994 100644 --- a/Exercises/Exercise-3/main.py +++ b/Exercises/Exercise-3/main.py @@ -1,44 +1,53 @@ import gzip import io +from os import SEEK_SET +import logging import boto3 from constants import BUCKET, COMMON_CRAWL_KEY - -def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): - ''' docstring. 
''' - ret_line = '' - data_file = io.BytesIO() - print('*'*75) - print(f'bucket: {bucket}, key: {key}') - print('*'*75) - client.download_fileobj(bucket, key, data_file) - data_file.seek(0) - - with gzip.open(filename=data_file, mode='rt', encoding='utf-8') as curr_file: - while num_lines: - content = curr_file.readline() - if content: - num_lines -= 1 - ret_line = content - if print_out: - print(content) - - data_file.close() - - return ret_line +logger = logging.getLogger() + + +def read_from_s3(client, bucket, key, full_file=False): + ''' + Reads from an s3 bucket given the following params: + + client: boto3 session client, + bucket: string, + key: string, + full_file: boolean representing single line or full read + + ''' + + logger.warning(f'retreiving from bucket: {bucket} with key: {key}\n') + content = '' + + with io.BytesIO() as data_file_obj: + client.download_fileobj(bucket, key, data_file_obj) + data_file_obj.seek(SEEK_SET) # prepare pointer for read, i.e. reset stream position + + with gzip.open(filename=data_file_obj, mode='rt', encoding='utf-8') as read_file: + if full_file: + content = read_file.read() + else: + content = read_file.readline() + print(content) + return content def main(): - ''' docstring''' - global COMMON_CRAWL_KEY + ''' + We read the first line off an index wet.paths file, and subsequently read the entire file at location. + ''' session = boto3.session.Session() esssthree = session.client('s3') - new_key = read_from_s3(esssthree, BUCKET, COMMON_CRAWL_KEY, num_lines=1) - _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, print_out=True) + new_key = read_from_s3(client=esssthree, bucket=BUCKET, key=COMMON_CRAWL_KEY) # first file parsed is an index, only first line needed + + _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, full_file=True) # whole file read out return