S3 Number of Objects, Size, and Storage Class Report
Note
- This is proof-of-concept (PoC) code for looking deeper into S3 buckets when you have many millions of objects under each S3 pseudo folder.
Tip
Usage info:
$ ./main.py profile-name region-name bucket-name [folder-name]
- The folder name is optional; if it is not provided, the code counts all objects under the whole S3 bucket.
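For example, to report only on one pseudo folder (the profile, region, bucket, and folder names here are hypothetical; note the trailing slash, since the argument is used as a raw key prefix):
$ ./main.py my-profile us-east-1 my-bucket logs/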
Code
#!/usr/bin/env python3
import sys

import boto3
from botocore.exceptions import BotoCoreError, ClientError

# Constants for storage classes
STANDARD = 'STANDARD'
REDUCED_REDUNDANCY = 'REDUCED_REDUNDANCY'
GLACIER = 'GLACIER'
STANDARD_IA = 'STANDARD_IA'
ONEZONE_IA = 'ONEZONE_IA'
INTELLIGENT_TIERING = 'INTELLIGENT_TIERING'
DEEP_ARCHIVE = 'DEEP_ARCHIVE'
OUTPOSTS = 'OUTPOSTS'
def check_args():
    """Validate CLI arguments; return the optional folder name, or None."""
    if len(sys.argv) == 4:
        return None
    if len(sys.argv) == 5:
        return sys.argv[4]
    print(f'Usage: {sys.argv[0]} profile-name region-name bucket-name [folder-name]')
    sys.exit(1)

def connect_aws(profile, region, service):
    """Create a boto3 client for the given profile, region, and service."""
    try:
        session = boto3.Session(profile_name=profile, region_name=region)
        return session.client(service)
    except (BotoCoreError, ClientError) as e:
        # Covers both credential/profile problems and API errors.
        print(e)
        sys.exit(1)

def list_s3_folders(s3_client, bucket_name, prefix, depth=0):
    """Recursively print each pseudo folder under prefix with its stats."""
    paginator = s3_client.get_paginator('list_objects_v2')
    # Delimiter='/' groups keys at this level into CommonPrefixes (pseudo folders).
    response_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
    for page in response_iterator:
        for folder in page.get('CommonPrefixes', []):
            folder_name = folder['Prefix']
            print(' ' * (depth * 4) + '|-- ' + folder_name.rstrip('/'))
            folder_size, num_files, storage_class = get_folder_info(s3_client, bucket_name, folder_name)
            print(' ' * ((depth + 1) * 4) + f"Size: {folder_size}, Number of files: {num_files}, Storage Class: {', '.join(storage_class)}")
            list_s3_folders(s3_client, bucket_name, prefix=folder_name, depth=depth + 1)

def get_folder_info(s3_client, bucket_name, prefix):
    """Return total size (bytes), object count, and storage-class counts under prefix."""
    total_size = 0
    num_files = 0
    storage_class_counts = {STANDARD: 0, REDUCED_REDUNDANCY: 0, GLACIER: 0, STANDARD_IA: 0,
                            ONEZONE_IA: 0, INTELLIGENT_TIERING: 0, DEEP_ARCHIVE: 0, OUTPOSTS: 0}
    # Paginate: a single list_objects_v2 call returns at most 1000 keys, which
    # would silently undercount prefixes holding millions of objects.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            total_size += obj['Size']
            num_files += 1
            storage_class = obj['StorageClass']
            # Also count storage classes introduced after the constants above.
            storage_class_counts[storage_class] = storage_class_counts.get(storage_class, 0) + 1
    return total_size, num_files, [f"{key}: {value}" for key, value in storage_class_counts.items() if value > 0]

def main():
    folder_name = check_args()
    profile = sys.argv[1]
    region = sys.argv[2]
    bucket_name = sys.argv[3]
    s3_client = connect_aws(profile, region, 's3')
    if folder_name is None:
        list_s3_folders(s3_client, bucket_name, '')
    else:
        list_s3_folders(s3_client, bucket_name, folder_name)

if __name__ == "__main__":
    main()
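A hypothetical run might print something like the following (folder names and numbers are made up; sizes are raw byte totals, and each folder's stats include everything nested under it, since the per-folder listing does not use a delimiter):
$ ./main.py my-profile us-east-1 my-bucket
|-- backups
    Size: 1073741824, Number of files: 4, Storage Class: GLACIER: 4
|-- logs
    Size: 52428800, Number of files: 1200, Storage Class: STANDARD: 1100, STANDARD_IA: 100
    |-- logs/2023
        Size: 10485760, Number of files: 300, Storage Class: STANDARD: 300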