# s3_mongo_helper.py
import os
import sys

import boto3
from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()
# Helper functions to upload to Mongo and S3

def initialize_s3():
    # Reads AWS credentials and bucket settings from the environment
    # (.env keys: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY_ID, AWS_BASE_URL, AWS_BUCKET)
    aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
    aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY_ID")
    aws = os.environ.get("AWS_BASE_URL")
    bucket = os.environ.get("AWS_BUCKET")
    s3 = boto3.client("s3",
                      aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key)
    return aws, bucket, s3
def initialize_mongo():
    # Builds the Atlas connection string from .env credentials
    mongo_url = "mongodb+srv://" + os.environ.get("SHARECHAT_DB_USERNAME") + ":" + os.environ.get("SHARECHAT_DB_PASSWORD") + "@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&ssl_cert_reqs=CERT_NONE"
    cli = MongoClient(mongo_url)
    db = cli[os.environ.get("SHARECHAT_DB_NAME")]
    coll = db[os.environ.get("SHARECHAT_DB_COLLECTION")]
    # A readable, non-empty collection is treated as a successful connection
    if coll.count_documents({}) > 0:
        return coll
    else:
        print("Error accessing Mongo collection")
        sys.exit()
def upload_to_s3(s3, file, filename, bucket, content_type):
    # Streams the local file to S3 and makes the object publicly readable
    with open(file, "rb") as data:
        s3.upload_fileobj(Fileobj=data,
                          Bucket=bucket,
                          Key=filename,
                          ExtraArgs={"ContentType": content_type,
                                     "ACL": "public-read"})

def upload_to_mongo(data, coll):
    # Inserts a single metadata document into the collection
    coll.insert_one(data)
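
# Usage sketch (illustrative, not part of the original module): upload a local
# file and record its metadata in Mongo. The file name, content type, document
# fields, and the aws-base-URL join below are hypothetical examples.
#
#     aws, bucket, s3 = initialize_s3()
#     coll = initialize_mongo()
#     upload_to_s3(s3, "media/my_video.mp4", "my_video.mp4", bucket, "video/mp4")
#     upload_to_mongo({"filename": "my_video.mp4",
#                      "s3_url": aws + "/my_video.mp4",  # assumed URL scheme
#                      "media_type": "video"}, coll)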
def count_s3_files(s3, bucket):
    # Paginates because list_objects_v2 returns at most 1000 keys per page
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket)
    file_count = 0
    for page in pages:
        # An empty bucket yields pages without a "Contents" key
        file_count += len(page.get("Contents", []))
    return file_count
# Get files uploaded to S3 in a date range
# Dates must be timezone-aware, e.g. datetime.datetime(YYYY, M, D, H, M, S, tzinfo=tzutc())
def filter_s3_files(s3, bucket, start_date, end_date):
    # Note: list_objects_v2 returns at most 1000 keys per call; use the
    # paginator (as in count_s3_files) for larger buckets
    objects = s3.list_objects_v2(Bucket=bucket)
    files = [{"Key": o["Key"]} for o in objects.get("Contents", [])
             if end_date > o["LastModified"] > start_date]
    return files
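
# Example (illustrative dates, not from the original file): build timezone-aware
# bounds with dateutil, since boto3 returns LastModified as tz-aware UTC.
#
#     import datetime
#     from dateutil.tz import tzutc
#     start = datetime.datetime(2020, 6, 1, tzinfo=tzutc())
#     end = datetime.datetime(2020, 6, 30, tzinfo=tzutc())
#     june_files = filter_s3_files(s3, bucket, start, end)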
def delete_s3_files(s3, bucket, start_date, end_date):
    keys_to_delete = filter_s3_files(s3, bucket, start_date, end_date)
    # delete_objects rejects an empty key list, so guard against it
    if keys_to_delete:
        s3.delete_objects(Bucket=bucket, Delete={"Objects": keys_to_delete})
    print("{} S3 files deleted".format(len(keys_to_delete)))
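
# Minimal smoke test, added as a sketch: assumes the .env variables above are
# set, prints the current object count, and modifies nothing.
if __name__ == "__main__":
    aws, bucket, s3 = initialize_s3()
    coll = initialize_mongo()
    print("Objects in bucket:", count_s3_files(s3, bucket))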