S3 Number of Objects, Size and Storage Class Report

Note
  • This is PoC code for looking deeper into S3 buckets when you have many millions of objects under each S3 pseudo-folder.
Tip

Usage info:

$ ./main.py profile region bucket-name [folder-name]

  • The folder name is optional; if it's not provided, the code counts all objects under the S3 bucket (see the example run below).
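
For example, a hypothetical run against a profile named dev, region us-east-1, bucket my-bucket, and prefix logs/ (all placeholder names; sizes are raw bytes, and the output shape follows the print statements in the code below) might look like:

$ ./main.py dev us-east-1 my-bucket logs/
|-- logs/2023
    Size: 104857600, Number of files: 1200, Storage Class: STANDARD: 1100, STANDARD_IA: 100
    |-- logs/2023/01
        Size: 5242880, Number of files: 60, Storage Class: STANDARD: 60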

Code

#!/usr/bin/env python3
import boto3
import sys
from botocore.exceptions import BotoCoreError, ClientError

# Constants for storage classes
STANDARD = 'STANDARD'
REDUCED_REDUNDANCY = 'REDUCED_REDUNDANCY'
GLACIER = 'GLACIER'
STANDARD_IA = 'STANDARD_IA'
ONEZONE_IA = 'ONEZONE_IA'
INTELLIGENT_TIERING = 'INTELLIGENT_TIERING'
DEEP_ARCHIVE = 'DEEP_ARCHIVE'
OUTPOSTS = 'OUTPOSTS'

def check_args():
    # Expect profile, region and bucket, plus an optional folder prefix
    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print(f'Usage: {sys.argv[0]} profile-name region-name bucket-name [folder-name]')
        sys.exit(1)
    if len(sys.argv) == 5:
        # Prefixes must end with '/' for the delimiter-based listing to work
        folder = sys.argv[4]
        return folder if folder.endswith('/') else folder + '/'
    return None

def connect_aws(profile, region, service):
    # Build a client from the named profile; fail fast on a bad profile or credentials
    try:
        session = boto3.Session(profile_name=profile, region_name=region)
        return session.client(service)
    except (BotoCoreError, ClientError) as e:
        print(e)
        sys.exit(1)

def list_s3_folders(s3_client, bucket_name, prefix, depth=0):
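    # Delimiter='/' groups keys into CommonPrefixes, i.e. the pseudo-folders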
    paginator = s3_client.get_paginator('list_objects_v2')
    response_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')

    for page in response_iterator:
        if 'CommonPrefixes' in page:
            for folder in page['CommonPrefixes']:
                folder_name = folder['Prefix']
                print(' ' * (depth * 4) + '|-- ' + folder_name.rstrip('/'))
                folder_size, num_files, storage_class = get_folder_info(s3_client, bucket_name, folder_name)
                print(' ' * ((depth + 1) * 4) + f"Size: {folder_size}, Number of files: {num_files}, Storage Class: {', '.join(storage_class)}")
                list_s3_folders(s3_client, bucket_name, prefix=folder_name, depth=depth + 1)

def get_folder_info(s3_client, bucket_name, prefix):
    # list_objects_v2 returns at most 1,000 keys per call, so paginate
    paginator = s3_client.get_paginator('list_objects_v2')
    total_size = 0
    num_files = 0
    storage_class_counts = dict.fromkeys(
        (STANDARD, REDUCED_REDUNDANCY, GLACIER, STANDARD_IA,
         ONEZONE_IA, INTELLIGENT_TIERING, DEEP_ARCHIVE, OUTPOSTS), 0)

    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            total_size += obj['Size']
            num_files += 1
            # StorageClass is normally present; default to STANDARD defensively
            storage_class = obj.get('StorageClass', STANDARD)
            if storage_class in storage_class_counts:
                storage_class_counts[storage_class] += 1

    return total_size, num_files, [f"{key}: {value}" for key, value in storage_class_counts.items() if value > 0]

def main():
    folder_name = check_args()
    profile = sys.argv[1]
    region = sys.argv[2]
    bucket_name = sys.argv[3]
    s3_client = connect_aws(profile, region, 's3')
    # An empty prefix means "walk the whole bucket"
    list_s3_folders(s3_client, bucket_name, folder_name or '')

if __name__ == "__main__":
    main()
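
Tip

If you only need the totals for a single prefix rather than the whole folder tree, the helpers can be reused directly. A minimal sketch, assuming the script above is saved as main.py next to this snippet (the profile, region, bucket and prefix names are placeholders):

#!/usr/bin/env python3
# Reuse the report's helpers to total up one prefix without printing the tree.
from main import connect_aws, get_folder_info

s3_client = connect_aws('dev', 'us-east-1', 's3')  # placeholder profile/region
size, count, classes = get_folder_info(s3_client, 'my-bucket', 'logs/')  # placeholder bucket/prefix
print(f"Size: {size}, Number of files: {count}, Storage Class: {', '.join(classes)}")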