From 21cc696d02228ce9b843f187d94a04e6d0c0c421 Mon Sep 17 00:00:00 2001 From: aycz Date: Wed, 19 Mar 2025 13:38:50 -0500 Subject: [PATCH 1/2] Implements all desired functionality. No tests yet. --- Exercises/Exercise-3/.gitignore | 2 ++ Exercises/Exercise-3/constants.py | 5 +++ Exercises/Exercise-3/docker-compose.yml | 6 ++-- Exercises/Exercise-3/main.py | 41 +++++++++++++++++++++++-- Exercises/Exercise-3/requirements.txt | 2 +- Exercises/Exercise-3/thoughts.txt | 41 +++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 Exercises/Exercise-3/.gitignore create mode 100644 Exercises/Exercise-3/constants.py create mode 100644 Exercises/Exercise-3/thoughts.txt diff --git a/Exercises/Exercise-3/.gitignore b/Exercises/Exercise-3/.gitignore new file mode 100644 index 00000000..169ecfdf --- /dev/null +++ b/Exercises/Exercise-3/.gitignore @@ -0,0 +1,2 @@ +# local environment files +*/.virtual/* diff --git a/Exercises/Exercise-3/constants.py b/Exercises/Exercise-3/constants.py new file mode 100644 index 00000000..c3198096 --- /dev/null +++ b/Exercises/Exercise-3/constants.py @@ -0,0 +1,5 @@ +''' CONSTANTS file for assigned S3 resource values. 
''' + +BUCKET = 'commoncrawl' +COMMON_CRAWL_KEY = 'crawl-data/CC-MAIN-2022-05/wet.paths.gz' +NEW_FILE_NAME = 'wet.paths.gz' diff --git a/Exercises/Exercise-3/docker-compose.yml b/Exercises/Exercise-3/docker-compose.yml index c87f658b..ac057645 100644 --- a/Exercises/Exercise-3/docker-compose.yml +++ b/Exercises/Exercise-3/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.9" services: test: image: "exercise-3" @@ -6,7 +5,10 @@ services: - .:/app command: python3 -m pytest run: + environment: + - AWS_PROFILE=default image: "exercise-3" volumes: + - ~/.aws:/root/.aws:ro - .:/app - command: python3 main.py \ No newline at end of file + command: python3 main.py diff --git a/Exercises/Exercise-3/main.py b/Exercises/Exercise-3/main.py index f71c4e19..da7ebb72 100644 --- a/Exercises/Exercise-3/main.py +++ b/Exercises/Exercise-3/main.py @@ -1,9 +1,46 @@ +import gzip +import io + import boto3 +from constants import BUCKET, COMMON_CRAWL_KEY + + +def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): + ''' docstring. 
''' + ret_line = '' + data_file = io.BytesIO() + print('*'*75) + print(f'bucket: {bucket}, key: {key}') + print('*'*75) + client.download_fileobj(bucket, key, data_file) + data_file.seek(0) + + with gzip.open(filename=data_file, mode='rt', encoding='utf-8') as curr_file: + while num_lines: + content = curr_file.readline() + if content: + num_lines -= 1 + ret_line = content + if print_out: + print(content) + + data_file.close() + + return ret_line + def main(): - # your code here - pass + ''' docstring''' + global COMMON_CRAWL_KEY + + session = boto3.session.Session() + esssthree = session.client('s3') + + new_key = read_from_s3(esssthree, BUCKET, COMMON_CRAWL_KEY, num_lines=1) + _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, print_out=True) + + return if __name__ == "__main__": diff --git a/Exercises/Exercise-3/requirements.txt b/Exercises/Exercise-3/requirements.txt index 1f43333c..30ddf823 100644 --- a/Exercises/Exercise-3/requirements.txt +++ b/Exercises/Exercise-3/requirements.txt @@ -1 +1 @@ -boto3==1.21.2 \ No newline at end of file +boto3 diff --git a/Exercises/Exercise-3/thoughts.txt b/Exercises/Exercise-3/thoughts.txt new file mode 100644 index 00000000..58099cdb --- /dev/null +++ b/Exercises/Exercise-3/thoughts.txt @@ -0,0 +1,41 @@ +def main(): + ''' docstring + TODO: + check if bucket exists with client error try/except block. + stream the file through a text buffer to get the first line. + get that url and gen new stream for said file, and print out. + ''' + +def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): + ''' + todo: + context wrap a file obj, and pass it into boto3 s3 client method + ``s3.download_fileobj('amzn-s3-demo-bucket', 'mykey', fileobj) + gzip buffer stream in `rt` mode, and readline(), first. + + with statement context manager did not close the file after finishing the decompressing + context. left lingering file, and is written to disk... 
+ used alternative of opening a binary buffer and writing, then finally closing. + ''' + # make line amount as a param. + ret_line = '' + # with open('test_file', 'wb+') as data_file: + +wt: 77+45+45+25+10+20+45+160+20 + +- bucket common crawl +- path: crawl-data/CC-MAIN-2022-05/wet.paths.gz + WET stands for "WARC Encapsulated Text" + WET format is quite simple: the WARC metadata contains various details, + including the URL and the length of the plaintext data, + with the plaintext data following immediately afterwards. +- AWS services (e.g. EMR) support the s3:// protocol, + and you may directly specify your input as s3://commoncrawl/path_to_file +- region: us-east-1 + +EC: + - no disk write -> context wrap the file being read in... + - can stream using a generator and next method + use tarfile module + use tarfile.open(mode='r:gz') as tf: + tf.extractfile(member) -> Extract a member from the archive as a file object. From ada384072e6c0f8d9d2c954925c0983d2d05bfe7 Mon Sep 17 00:00:00 2001 From: aycz Date: Thu, 4 Dec 2025 15:29:16 -0600 Subject: [PATCH 2/2] Adds refactors for better file object handling and console outputs. --- Exercises/Exercise-3/main.py | 63 ++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/Exercises/Exercise-3/main.py b/Exercises/Exercise-3/main.py index da7ebb72..1d15b994 100644 --- a/Exercises/Exercise-3/main.py +++ b/Exercises/Exercise-3/main.py @@ -1,44 +1,53 @@ import gzip import io +from os import SEEK_SET +import logging import boto3 from constants import BUCKET, COMMON_CRAWL_KEY - -def read_from_s3(client, bucket, key, num_lines=-1, print_out=False): - ''' docstring. 
''' - ret_line = '' - data_file = io.BytesIO() - print('*'*75) - print(f'bucket: {bucket}, key: {key}') - print('*'*75) - client.download_fileobj(bucket, key, data_file) - data_file.seek(0) - - with gzip.open(filename=data_file, mode='rt', encoding='utf-8') as curr_file: - while num_lines: - content = curr_file.readline() - if content: - num_lines -= 1 - ret_line = content - if print_out: - print(content) - - data_file.close() - - return ret_line +logger = logging.getLogger() + + +def read_from_s3(client, bucket, key, full_file=False): + ''' + Reads from an s3 bucket given the following params: + + client: boto3 session client, + bucket: string, + key: string, + full_file: boolean representing single line or full read + + ''' + + logger.warning(f'retreiving from bucket: {bucket} with key: {key}\n') + content = '' + + with io.BytesIO() as data_file_obj: + client.download_fileobj(bucket, key, data_file_obj) + data_file_obj.seek(SEEK_SET) # prepare pointer for read, i.e. reset stream position + + with gzip.open(filename=data_file_obj, mode='rt', encoding='utf-8') as read_file: + if full_file: + content = read_file.read() + else: + content = read_file.readline() + print(content) + return content def main(): - ''' docstring''' - global COMMON_CRAWL_KEY + ''' + We read the first line off an index wet.paths file, and subsequently read the entire file at location. + ''' session = boto3.session.Session() esssthree = session.client('s3') - new_key = read_from_s3(esssthree, BUCKET, COMMON_CRAWL_KEY, num_lines=1) - _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, print_out=True) + new_key = read_from_s3(client=esssthree, bucket=BUCKET, key=COMMON_CRAWL_KEY) # first file parsed is an index, only first line needed + + _ = read_from_s3(client=esssthree, bucket=BUCKET, key=new_key, full_file=True) # whole file read out return