Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Exercises/Exercise-3/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# local environment files
*/.virtual/*
5 changes: 5 additions & 0 deletions Exercises/Exercise-3/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
''' CONSTANTS file for assigned S3 resource values. '''

# Name of the public Common Crawl S3 bucket.
BUCKET = 'commoncrawl'
# Key of the gzipped index file; each of its lines is the key of a WET data file.
COMMON_CRAWL_KEY = 'crawl-data/CC-MAIN-2022-05/wet.paths.gz'
# Local filename for the downloaded index — NOTE(review): not referenced in the
# visible main.py; presumably kept for a disk-writing variant. Confirm before removing.
NEW_FILE_NAME = 'wet.paths.gz'
6 changes: 4 additions & 2 deletions Exercises/Exercise-3/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
version: "3.9"
services:
test:
image: "exercise-3"
volumes:
- .:/app
command: python3 -m pytest
run:
environment:
- AWS_PROFILE=default
image: "exercise-3"
volumes:
- ~/.aws:/root/.aws:ro
- .:/app
command: python3 main.py
command: python3 main.py
50 changes: 48 additions & 2 deletions Exercises/Exercise-3/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,55 @@
import gzip
import io
from os import SEEK_SET
import logging

import boto3

from constants import BUCKET, COMMON_CRAWL_KEY

logger = logging.getLogger()


def read_from_s3(client, bucket, key, full_file=False):
    '''
    Download an S3 object and decompress it as gzipped UTF-8 text.

    client: boto3 S3 client,
    bucket: string, S3 bucket name,
    key: string, object key within the bucket,
    full_file: boolean; True reads the whole decompressed file,
        False reads only the first line, stripped of its trailing
        newline so the value can be reused directly as an S3 key.

    Returns the decoded text content (str); also prints it to stdout.
    '''
    # lazy %-style args: the message is only formatted if the record is emitted
    logger.warning('retrieving from bucket: %s with key: %s\n', bucket, key)
    content = ''

    with io.BytesIO() as data_file_obj:
        client.download_fileobj(bucket, key, data_file_obj)
        # bug fix: seek(SEEK_SET) passed the whence constant as the offset and
        # only worked because SEEK_SET == 0; rewind explicitly to position 0.
        data_file_obj.seek(0, SEEK_SET)

        with gzip.open(filename=data_file_obj, mode='rt', encoding='utf-8') as read_file:
            if full_file:
                content = read_file.read()
            else:
                # bug fix: readline() keeps the trailing '\n'; stripping it
                # prevents a corrupted key when the caller reuses this value.
                content = read_file.readline().rstrip('\n')
    print(content)
    return content


def main():
    '''
    Read the first line of the Common Crawl wet.paths index file, then
    download and print the full file found at that location.
    '''
    session = boto3.session.Session()
    s3_client = session.client('s3')

    # the index file's first line is itself an S3 key pointing at a WET data file
    new_key = read_from_s3(client=s3_client, bucket=BUCKET, key=COMMON_CRAWL_KEY)

    # guard against a trailing newline left behind by readline(); a key with
    # an embedded '\n' would never match an object in the bucket
    new_key = new_key.rstrip('\n')

    # whole file read out (printed inside read_from_s3)
    _ = read_from_s3(client=s3_client, bucket=BUCKET, key=new_key, full_file=True)

    return


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion Exercises/Exercise-3/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
boto3==1.21.2
boto3
41 changes: 41 additions & 0 deletions Exercises/Exercise-3/thoughts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
def main():
''' docstring
TODO:
check if bucket exists with client error try/except block.
stream the file through a text buffer to get the first line.
get that url and gen new stream for said file, and print out.
'''

def read_from_s3(client, bucket, key, num_lines=-1, print_out=False):
'''
todo:
context wrap a file obj, and pass it into boto3 s3 client method
``s3.download_fileobj('amzn-s3-demo-bucket', 'mykey', fileobj)
gzip buffer stream in `rt` mode, and readline(), first.

with statement context manager did not close the file after finishing the decompressing
context. left lingering file, and is written to disk...
used alternative of opening a binary buffer and writing, then finally closing.
'''
# make line amount as a param.
ret_line = ''
# with open('test_file', 'wb+') as data_file:

wt: 77+45+45+25+10+20+45+160+20

- bucket common crawl
- path: crawl-data/CC-MAIN-2022-05/wet.paths.gz
WET stands for "WARC Encapsulated Text"
WET format is quite simple: the WARC metadata contains various details,
including the URL and the length of the plaintext data,
with the plaintext data following immediately afterwards.
- AWS services (e.g. EMR) support the s3:// protocol,
and you may directly specify your input as s3://commoncrawl/path_to_file
- region: us-east-1

EC:
- no disk write -> context wrap the file being read in...
- can stream using a generator and next method
use tarfile module
use tarfile.open(mode='r:gz') as tf:
tf.extractfile(member) -> Extract a member from the archive as a file object.