List AWS S3 objects using boto3 with pagination
Note
- Counting the number of objects under an S3 folder is tricky because the AWS console metrics only provide an object count for the whole S3 bucket.
- The "aws s3 ls" command is slow; if you have around 10 million objects, it will take a long time.
- This is a quick PoC for counting objects under an S3 folder using pagination and the newer, hopefully faster "list_objects_v2" API call; a minimal sketch of the paginator idiom follows below.
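For reference, here is a minimal sketch of the paginator idiom the script builds on. The profile, region, bucket, and folder names below are placeholders, not values from this article:

import boto3

# Placeholder profile/region/bucket/prefix; substitute your own values
session = boto3.session.Session(profile_name="my-profile", region_name="us-east-1")
s3 = session.client("s3")
paginator = s3.get_paginator("list_objects_v2")
count = 0
for page in paginator.paginate(Bucket="my-bucket", Prefix="my-folder/"):
    # "Contents" is missing on empty pages
    count += len(page.get("Contents", []))
print(count)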
Tip
Usage info:
$ ./main.py profile-name region-name bucket-name [folder-name]
- The folder name is optional; if it is not provided, the script counts all objects in the S3 bucket.
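For example, a hypothetical invocation counting objects under a "logs/" folder (the profile, region, and bucket names here are made up):

$ ./main.py my-profile us-east-1 my-bucket logs/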
Code
#!/usr/bin/env python3
import boto3
import sys
from botocore.exceptions import ClientError


def check_args():
    # Validate CLI arguments; return the optional folder name (prefix) or None
    if len(sys.argv) < 4:
        print(f'Usage: {sys.argv[0]} profile-name region-name bucket-name [folder-name]')
        sys.exit(1)
    elif len(sys.argv) == 5:
        return sys.argv[4]
    return None


def connect_aws(vProfile, vRegion, vService):
    # Create a boto3 client for the given service using the named profile and region
    try:
        boto3.setup_default_session(profile_name=vProfile, region_name=vRegion)
        return boto3.client(vService)
    except ClientError as e:
        print(e)
        sys.exit(1)


#
# MAIN STARTS HERE
#
if __name__ == '__main__':
    # Check number of arguments
    gFolderName = check_args()

    # Set vars
    gProfile = sys.argv[1]
    gRegion = sys.argv[2]
    gBucketName = sys.argv[3]
    gCount = 0
    gSubCount = 0

    # Connect to AWS
    worker_s3 = connect_aws(gProfile, gRegion, 's3')

    # List objects under the S3 bucket or folder (prefix)
    paginator = worker_s3.get_paginator("list_objects_v2")
    if gFolderName is None:
        response = paginator.paginate(Bucket=gBucketName)
    else:
        response = paginator.paginate(Bucket=gBucketName, Prefix=gFolderName)

    print("Listing files")
    for page in response:
        # "Contents" is missing when a page (or the whole listing) is empty
        files = page.get("Contents", [])
        for file in files:
            gCount += 1
            gSubCount += 1
            # Report progress every million objects
            if gSubCount == 1000000:
                print(f"Total number of objects counted so far: {gCount}")
                gSubCount = 0
            # print(f"File: {file['Key']}, Size: {file['Size']}, StorageClass: {file['StorageClass']}")

    print(f"Total number of objects: {gCount}")