diff --git a/Exercises/Exercise-3/.gitignore b/Exercises/Exercise-3/.gitignore new file mode 100644 index 00000000..169ecfdf --- /dev/null +++ b/Exercises/Exercise-3/.gitignore @@ -0,0 +1,2 @@ +# local environment files +*/.virtual/* diff --git a/Exercises/Exercise-3/constants.py b/Exercises/Exercise-3/constants.py new file mode 100644 index 00000000..c3198096 --- /dev/null +++ b/Exercises/Exercise-3/constants.py @@ -0,0 +1,5 @@ +''' CONSTANTS file for assigned S3 resource values. ''' + +BUCKET = 'commoncrawl' +COMMON_CRAWL_KEY = 'crawl-data/CC-MAIN-2022-05/wet.paths.gz' +NEW_FILE_NAME = 'wet.paths.gz' diff --git a/Exercises/Exercise-3/docker-compose.yml b/Exercises/Exercise-3/docker-compose.yml index c87f658b..ac057645 100644 --- a/Exercises/Exercise-3/docker-compose.yml +++ b/Exercises/Exercise-3/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.9" services: test: image: "exercise-3" @@ -6,7 +5,10 @@ services: - .:/app command: python3 -m pytest run: + environment: + - AWS_PROFILE=default image: "exercise-3" volumes: + - ~/.aws:/root/.aws:ro - .:/app - command: python3 main.py \ No newline at end of file + command: python3 main.py diff --git a/Exercises/Exercise-3/main.py b/Exercises/Exercise-3/main.py index f71c4e19..1d15b994 100644 --- a/Exercises/Exercise-3/main.py +++ b/Exercises/Exercise-3/main.py @@ -1,9 +1,55 @@ +import gzip +import io +from os import SEEK_SET +import logging + import boto3 +from constants import BUCKET, COMMON_CRAWL_KEY + +logger = logging.getLogger() + + +def read_from_s3(client, bucket, key, full_file=False): + ''' + Reads from an s3 bucket given the following params: + + client: boto3 session client, + bucket: string, + key: string, + full_file: boolean representing single line or full read + + ''' + + logger.warning(f'retreiving from bucket: {bucket} with key: {key}\n') + content = '' + + with io.BytesIO() as data_file_obj: + client.download_fileobj(bucket, key, data_file_obj) + data_file_obj.seek(SEEK_SET) # prepare pointer for read, 
i.e. reset stream position + + with gzip.open(filename=data_file_obj, mode='rt', encoding='utf-8') as read_file: + if full_file: + content = read_file.read() + else: + content = read_file.readline() + print(content) + return content + def main(): - # your code here - pass + ''' + We read the first line off an index wet.paths file, and subsequently read the entire file at location. + ''' + + session = boto3.session.Session() + esssthree = session.client('s3') + + new_key = read_from_s3(client=esssthree, bucket=BUCKET, key=COMMON_CRAWL_KEY) # first file parsed is an index, only first line needed + + _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, full_file=True) # whole file read out + + return if __name__ == "__main__": diff --git a/Exercises/Exercise-3/requirements.txt b/Exercises/Exercise-3/requirements.txt index 1f43333c..30ddf823 100644 --- a/Exercises/Exercise-3/requirements.txt +++ b/Exercises/Exercise-3/requirements.txt @@ -1 +1 @@ -boto3==1.21.2 \ No newline at end of file +boto3 diff --git a/Exercises/Exercise-3/thoughts.txt b/Exercises/Exercise-3/thoughts.txt new file mode 100644 index 00000000..58099cdb --- /dev/null +++ b/Exercises/Exercise-3/thoughts.txt @@ -0,0 +1,41 @@ +def main(): + ''' docstring + TODO: + check if bucket exists with client error try/except block. + stream the file through a text buffer to get the first line. + get that url and gen new stream for said file, and print out. + ''' + +def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): + ''' + todo: + context wrap a file obj, and pass it into boto3 s3 client method + ``s3.download_fileobj('amzn-s3-demo-bucket', 'mykey', fileobj) + gzip buffer stream in `rt` mode, and readline(), first. + + with statement context manager did not close the file after finishing the decompressing + context. left lingering file, and is written to disk... + used alternative of opening a binary buffer and writing, then finally closing. + ''' + # make line amount as a param. 
+ ret_line = ''
+ # with open('test_file', 'wb+') as data_file:

+wt: 77+45+45+25+10+20+45+160+20

+- bucket: commoncrawl
+- path: crawl-data/CC-MAIN-2022-05/wet.paths.gz
+ WET stands for "WARC Encapsulated Text"
+ WET format is quite simple: the WARC metadata contains various details,
+ including the URL and the length of the plaintext data,
+ with the plaintext data following immediately afterwards.
+- AWS services (e.g. EMR) support the s3:// protocol,
+ and you may directly specify your input as s3://commoncrawl/path_to_file
+- region: us-east-1
+
+EC:
+ - no disk write -> context wrap the file being read in...
+ - can stream using a generator and next method
+ use tarfile module
+ use tarfile.open(mode='r:gz') as tf:
+ tf.extractfile(member) -> Extract a member from the archive as a file object.