Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Exercises/Exercise-3/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# local environment files
*/.virtual/*
5 changes: 5 additions & 0 deletions Exercises/Exercise-3/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
''' CONSTANTS file for assigned S3 resource values. '''

# Name of the public Common Crawl S3 bucket.
BUCKET = 'commoncrawl'
# Key of the gzipped index file; each of its lines is the key of a WET data file.
COMMON_CRAWL_KEY = 'crawl-data/CC-MAIN-2022-05/wet.paths.gz'
# Local filename for the downloaded index — NOTE(review): not referenced in the
# visible main.py; presumably kept for a disk-writing variant. Confirm before removing.
NEW_FILE_NAME = 'wet.paths.gz'
6 changes: 4 additions & 2 deletions Exercises/Exercise-3/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
version: "3.9"
services:
test:
image: "exercise-3"
volumes:
- .:/app
command: python3 -m pytest
run:
environment:
- AWS_PROFILE=default
image: "exercise-3"
volumes:
- ~/.aws:/root/.aws:ro
- .:/app
command: python3 main.py
command: python3 main.py
50 changes: 48 additions & 2 deletions Exercises/Exercise-3/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,55 @@
import gzip
import io
from os import SEEK_SET
import logging

import boto3

from constants import BUCKET, COMMON_CRAWL_KEY

logger = logging.getLogger()


def read_from_s3(client, bucket, key, full_file=False):
    '''
    Download an S3 object and decompress it as gzipped UTF-8 text.

    client: boto3 S3 client,
    bucket: string, S3 bucket name,
    key: string, object key within the bucket,
    full_file: boolean; True reads the whole decompressed file,
        False reads only the first line, stripped of its trailing
        newline so the value can be reused directly as an S3 key.

    Returns the decoded text content (str); also prints it to stdout.
    '''
    # lazy %-style args: the message is only formatted if the record is emitted
    logger.warning('retrieving from bucket: %s with key: %s\n', bucket, key)
    content = ''

    with io.BytesIO() as data_file_obj:
        client.download_fileobj(bucket, key, data_file_obj)
        # bug fix: seek(SEEK_SET) passed the whence constant as the offset and
        # only worked because SEEK_SET == 0; rewind explicitly to position 0.
        data_file_obj.seek(0, SEEK_SET)

        with gzip.open(filename=data_file_obj, mode='rt', encoding='utf-8') as read_file:
            if full_file:
                content = read_file.read()
            else:
                # bug fix: readline() keeps the trailing '\n'; stripping it
                # prevents a corrupted key when the caller reuses this value.
                content = read_file.readline().rstrip('\n')
    print(content)
    return content


def main():
    '''
    Read the first line of the Common Crawl wet.paths index file, then
    download and print the full file found at that location.
    '''
    session = boto3.session.Session()
    s3_client = session.client('s3')

    # the index file's first line is itself an S3 key pointing at a WET data file
    new_key = read_from_s3(client=s3_client, bucket=BUCKET, key=COMMON_CRAWL_KEY)

    # guard against a trailing newline left behind by readline(); a key with
    # an embedded '\n' would never match an object in the bucket
    new_key = new_key.rstrip('\n')

    # whole file read out (printed inside read_from_s3)
    _ = read_from_s3(client=s3_client, bucket=BUCKET, key=new_key, full_file=True)

    return


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion Exercises/Exercise-3/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
boto3==1.21.2
boto3
41 changes: 41 additions & 0 deletions Exercises/Exercise-3/thoughts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
def main():
''' docstring
TODO:
check if bucket exists with client error try/except block.
stream the file through a text buffer to get the first line.
get that url and gen new stream for said file, and print out.
'''

def read_from_s3(client, bucket, key, num_lines=-1, print_out=False):
'''
todo:
context wrap a file obj, and pass it into boto3 s3 client method
``s3.download_fileobj('amzn-s3-demo-bucket', 'mykey', fileobj)
gzip buffer stream in `rt` mode, and readline(), first.

with statement context manager did not close the file after finishing the decompressing
context. left lingering file, and is written to disk...
used alternative of opening a binary buffer and writing, then finally closing.
'''
# make line amount as a param.
ret_line = ''
# with open('test_file', 'wb+') as data_file:

wt: 77+45+45+25+10+20+45+160+20

- bucket common crawl
- path: crawl-data/CC-MAIN-2022-05/wet.paths.gz
WET stands for "WARC Encapsulated Text"
WET format is quite simple: the WARC metadata contains various details,
including the URL and the length of the plaintext data,
with the plaintext data following immediately afterwards.
- AWS services (e.g. EMR) support the s3:// protocol,
and you may directly specify your input as s3://commoncrawl/path_to_file
- region: us-east-1

EC:
- no disk write -> context wrap the file being read in...
- can stream using a generator and next method
use tarfile module
use tarfile.open(mode='r:gz') as tf:
tf.extractfile(member) -> Extract a member from the archive as a file object.