# Source code for hats.io.size_estimates
"""General utilities for estimating size of input and output."""
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import pyarrow as pa
from upath import UPath
from hats.io import file_io
[docs]
def estimate_dir_size(path: str | Path | UPath | None = None, *, divisor=1):
    """Estimate the disk usage of a directory, including everything nested below it.

    Args:
        path (str | Path | UPath | None): the directory to measure. It is passed
            through ``file_io.get_upath``; if that yields ``None`` the estimate is 0.
        divisor (int): value to divide the byte total by. Only applied when it is
            greater than 1; otherwise the size is returned in bytes.

    Returns:
        int: sum of ``st_size`` over all non-directory entries, truncated to an
        integer after division when ``divisor > 1``.
    """
    root = file_io.get_upath(path)
    if root is None:
        return 0

    size_in_bytes = 0
    # Walk the tree with an explicit stack of directories still to be listed.
    pending = [root]
    while pending:
        current = pending.pop()
        for entry in current.iterdir():
            if entry.is_dir():
                pending.append(entry)
            else:
                size_in_bytes += entry.stat().st_size

    if divisor > 1:
        return int(size_in_bytes / divisor)
    return size_in_bytes
def _get_row_mem_size_data_frame(row):
    """Return the estimated memory size of one pandas dataframe row.

    The estimate is the size of the row tuple itself plus the size of every
    item it holds. Items are measured with ``sys.getsizeof``; numpy arrays
    additionally contribute their ``nbytes``.

    Args:
        row (tuple): the row from the dataframe

    Returns:
        int: the memory size of the row in bytes
    """

    def _item_size(value):
        # Object overhead as reported by the interpreter.
        shallow = sys.getsizeof(value)
        if isinstance(value, np.ndarray):
            # Array payload is added on top of the object overhead.
            # NOTE(review): getsizeof may already include the buffer for arrays
            # that own their data - confirm whether this double-counts.
            return value.nbytes + shallow
        return shallow

    # Container overhead first, then the per-item contributions.
    return sys.getsizeof(row) + sum(_item_size(value) for value in row)
def _get_row_mem_size_pa_table(table, row_index):
    """Given a pyarrow table and a row index, return the memory size of that row.

    Every cell of the row is converted to its Python value with ``as_py()`` and
    measured with ``sys.getsizeof``. The size of the ``row_index`` integer is
    added as the per-row overhead term.

    Args:
        table (pa.Table): the pyarrow table
        row_index (int): the index of the row to measure

    Returns:
        int: the memory size of the row in bytes
    """
    # A table has no per-row object, so the index integer stands in as the
    # row overhead term.
    row_overhead = sys.getsizeof(row_index)
    # Indexing a table column yields a pyarrow scalar, never a numpy array,
    # so every cell is measured through its Python value.
    cell_sizes = (sys.getsizeof(column[row_index].as_py()) for column in table.itercolumns())
    return row_overhead + sum(cell_sizes)
[docs]
def get_mem_size_per_row(data):
    """Return the estimated memory size of every row in a chunk of tabular data.

    Args:
        data (pd.DataFrame or pa.Table): the data chunk to measure

    Returns:
        list[int]: list of memory sizes for each row in the chunk

    Raises:
        NotImplementedError: if ``data`` is neither a pandas dataframe nor a
            pyarrow table.
    """
    if isinstance(data, pd.DataFrame):
        # Plain tuples without the index, one per dataframe row.
        row_tuples = data.itertuples(index=False, name=None)
        return list(map(_get_row_mem_size_data_frame, row_tuples))

    if isinstance(data, pa.Table):
        return [_get_row_mem_size_pa_table(data, row_number) for row_number in range(data.num_rows)]

    raise NotImplementedError(f"Unsupported data type {type(data)} for memory size calculation")