Contents

List AWS S3 objects using boto3 with pagination

Contents
Note
  • Counting the number of objects under an S3 folder is tricky because the AWS console metrics only report the object count for the whole S3 bucket.

  • The “aws s3 ls” command is really slow; if you have around 10 million objects, it will take a long time.

  • This is a quick PoC for counting objects under an S3 folder using pagination and the newer, hopefully faster, “list_objects_v2” API call.

Tip

Usage info:

$ ./main.py profile region bucket-name [folder-name]

  • The folder name is optional; if it is not provided, the code counts all objects in the S3 bucket.

Code

#!/usr/bin/env python3

import boto3
import sys
from botocore.exceptions import ClientError

def check_args():
    """Validate the command line and return the optional folder name.

    Expects ``sys.argv`` to hold the script name followed by
    ``profile-name region-name bucket-name [folder-name]``.

    Returns:
        The folder name (``sys.argv[4]``) when one was supplied, otherwise
        the integer ``0``, which the caller compares against as its
        "no folder given" sentinel.

    Raises:
        SystemExit: With status 2, after printing the usage line to stderr,
            when fewer than three or more than four arguments are supplied.
    """
    arg_count = len(sys.argv)
    if arg_count < 4 or arg_count > 5:
        # A usage error must not look like success to the calling shell, and
        # surplus arguments must not fall through and yield an implicit None.
        print(
            f'Usage: {sys.argv[0]} profile-name region-name bucket-name [folder-name]',
            file=sys.stderr,
        )
        sys.exit(2)
    if arg_count == 5:
        return sys.argv[4]
    return 0

def connect_aws(vProfile, vRegion, vService):
    """Create a boto3 client for *vService* using a named profile and region.

    Configures boto3's default session with the given profile and region,
    then builds the client from that default session.

    Args:
        vProfile: Profile name passed to ``boto3.setup_default_session``.
        vRegion: Region name passed to ``boto3.setup_default_session``.
        vService: Service name passed to ``boto3.client`` (e.g. ``'s3'``).

    Returns:
        The boto3 client for the requested service.

    Raises:
        Whatever boto3/botocore raises while building the session or client.
        Errors are deliberately not caught here: printing the error and
        falling through would hand ``None`` back to the caller, which then
        fails later with an unrelated ``AttributeError``.
    """
    boto3.setup_default_session(profile_name=vProfile, region_name=vRegion)
    return boto3.client(vService)

def _count_objects(s3_client, bucket_name, prefix=None, report_every=1_000_000):
    """Count the objects in a bucket, optionally restricted to a key prefix.

    Walks every page of the ``list_objects_v2`` paginator and sums the number
    of entries in each page's ``Contents`` list. A progress line is printed
    each time the running total passes another multiple of *report_every*.

    Args:
        s3_client: Object providing ``get_paginator("list_objects_v2")``.
        bucket_name: Value passed as ``Bucket`` to the paginator.
        prefix: Value passed as ``Prefix``; omitted from the call when None.
        report_every: Number of objects between two progress lines.

    Returns:
        The total number of objects listed.
    """
    paginate_args = {"Bucket": bucket_name}
    if prefix is not None:
        paginate_args["Prefix"] = prefix

    paginator = s3_client.get_paginator("list_objects_v2")
    total = 0
    next_report = report_every
    for page in paginator.paginate(**paginate_args):
        # A page without keys (empty bucket / unmatched prefix) has no
        # "Contents" entry at all, so default to an empty list.
        total += len(page.get("Contents", []))
        # Report each milestone that was crossed while adding this page.
        while total >= next_report:
            print(f"Total number of object count so far : {next_report}")
            next_report += report_every
    return total


def main():
    """Parse the command line, connect to S3 and print the object count.

    Exits with status 1 when the connection cannot be established or the
    listing fails; the error is printed to stderr.
    """
    # Imported here so this block does not depend on a file-level import
    # beyond the ones the script already has.
    from botocore.exceptions import BotoCoreError

    # Check number of arguments; a falsy result (0) means "no folder given".
    folder_name = check_args()
    profile, region, bucket_name = sys.argv[1:4]
    prefix = folder_name or None

    # Connect to AWS
    try:
        worker_s3 = connect_aws(profile, region, 's3')
    except (BotoCoreError, ClientError) as e:
        print(e, file=sys.stderr)
        sys.exit(1)
    if worker_s3 is None:
        # No client was returned, so there is nothing to list with.
        sys.exit(1)

    # List objects under the s3 bucket or folder
    print("Listing files")
    try:
        total = _count_objects(worker_s3, bucket_name, prefix)
    except (BotoCoreError, ClientError) as e:
        print(e, file=sys.stderr)
        sys.exit(1)
    print(f"Total Number of objects: {total}")


#
# MAIN STARTS HERE
#
if __name__ == '__main__':
    main()