Source code for hats.pixel_math.sparse_histogram

"""Sparse 1-D histogram of healpix pixel counts."""

import numpy as np

import hats.pixel_math.healpix_shim as hp
from hats.io import file_io


class SparseHistogram:
    """Histogram kept in sparse form: only the populated bins are stored.

    The histogram is described by two parallel sequences (the positions of
    the non-zero bins and the value held in each of them) plus the healpix
    order that determines the length of the dense equivalent.

    For example, a dense 1-d numpy histogram of order 0 might look like::

        [0, 4, 0, 0, 0, 0, 0, 0, 9, 0, 0]

    Only positions ``[1, 8]`` are populated, holding ``[4, 9]``. The matching
    sparse histogram is built with::

        SparseHistogram([1, 8], [4, 9], 0)

    Parameters
    ----------
    indexes : array_like of int
        positions of the populated bins
    counts : array_like of int
        value of each populated bin, in the same order as ``indexes``
    order : int
        healpix order of the histogram

    Raises
    ------
    ValueError
        if ``indexes`` and ``counts`` differ in length
    """

    def __init__(self, indexes, counts, order):
        n_indexes, n_counts = len(indexes), len(counts)
        if n_indexes != n_counts:
            raise ValueError("indexes and counts must be same length")
        self.indexes = indexes
        self.counts = counts
        self.order = order

    def to_array(self):
        """Expand the sparse representation into a dense array.

        Returns
        -------
        np.ndarray
            1-d ``int64`` array of length ``hp.order2npix(self.order)``,
            zero everywhere except at ``self.indexes``.
        """
        npix = hp.order2npix(self.order)
        result = np.zeros(npix, dtype=np.int64)
        result[self.indexes] = self.counts
        return result

    def to_file(self, file_name):
        """Write the histogram to disk in its sparse form.

        The indexes, counts and order are stored with ``np.savez``. Because
        only the populated bins are written, this will usually need less
        space than the corresponding dense array.

        Parameters
        ----------
        file_name : path-like
            destination file
        """
        target = file_io.get_upath(file_name)
        with target.open("wb") as out_handle:
            np.savez(out_handle, indexes=self.indexes, counts=self.counts, order=self.order)

    def to_dense_file(self, file_name):
        """Write the DENSE form of the histogram to disk.

        The buffer of the array returned by :meth:`to_array` is written to
        the file as-is.

        Parameters
        ----------
        file_name : path-like
            destination file
        """
        target = file_io.get_upath(file_name)
        with target.open("wb") as out_handle:
            dense_buffer = self.to_array().data
            out_handle.write(dense_buffer)

    @classmethod
    def from_file(cls, file_name):
        """Load a sparse histogram previously written by :meth:`to_file`.

        Parameters
        ----------
        file_name : path-like
            file to read from

        Returns
        -------
        SparseHistogram
            histogram built from the stored indexes, counts and order
        """
        source = file_io.get_upath(file_name)
        with source.open("rb") as in_handle:
            # Entries are read while the handle is still open.
            archive = np.load(in_handle)
            stored_indexes = archive["indexes"]
            stored_counts = archive["counts"]
            stored_order = archive["order"]
            return cls(stored_indexes, stored_counts, stored_order)

    def __eq__(self, value):
        """Two histograms are equal when indexes, counts and order all match."""
        if not isinstance(value, SparseHistogram):
            return False
        if not np.array_equal(self.indexes, value.indexes):
            return False
        if not np.array_equal(self.counts, value.counts):
            return False
        return self.order == value.order

    def __str__(self):
        """Human-readable summary of the order, indexes and values."""
        return (
            f"Histogram at order {self.order}\n"
            f" - indexes: {self.indexes}\n"
            f" - values: {self.counts}"
        )
class HistogramAggregator:
    """Running dense total of sparse histograms that share one healpix order.

    Parameters
    ----------
    order : int
        healpix order of every histogram that will be added; it fixes the
        length of the dense accumulator (``hp.order2npix(order)``).
    """

    def __init__(self, order):
        self.order = order
        npix = hp.order2npix(order)
        self.full_histogram = np.zeros(npix, dtype=np.int64)

    def add(self, other):
        """Fold another sparse histogram into the running total, in place.

        ``None`` and histograms without any entries are accepted and leave
        the total untouched.

        Parameters
        ----------
        other : SparseHistogram
            the histogram whose counts should be added

        Raises
        ------
        ValueError
            if ``other`` is not a ``SparseHistogram``, or if its order
            differs from this aggregator's order
        """
        if other is None:
            return
        if not isinstance(other, SparseHistogram):
            raise ValueError("Both addends should be SparseHistogram.")
        if self.order != other.order:
            raise ValueError(
                "The histogram partials have incompatible sizes due to different healpix orders."
            )
        if len(other.indexes) > 0:
            self.full_histogram[other.indexes] += other.counts

    def to_sparse(self):
        """Build a SparseHistogram from the non-zero entries of the running total."""
        populated = np.nonzero(self.full_histogram)[0]
        totals = self.full_histogram[populated]
        return SparseHistogram(populated, totals, self.order)
def supplemental_count_histogram(mapped_pixels, supplemental_count, highest_order):
    """Build per-pixel histograms of row counts and of a supplemental count.

    Rows are grouped by their healpix pixel. The first histogram holds the
    number of rows in each pixel; the second (optional) histogram holds the
    sum, per pixel, of a supplemental per-row quantity. Typically used during
    import, when you wish to partition according to some supplemental data,
    such as in-memory size, or length of a nested column.

    Parameters
    ----------
    mapped_pixels : array_like of int
        1-D array of healpix pixel IDs, one per row.
    supplemental_count : None or array_like of int
        Optional 1-D array of supplemental counts, one per row and aligned
        with ``mapped_pixels``. Values are converted to ``int64`` before
        being summed. If ``None``, no supplemental histogram is produced.
    highest_order : int
        Healpix order recorded on the returned histograms.

    Returns
    -------
    tuple
        ``(row_count_histo, supplemental_count_histo)``. ``row_count_histo``
        is a :class:`SparseHistogram` with the number of rows per pixel.
        ``supplemental_count_histo`` is a :class:`SparseHistogram` with the
        per-pixel sum of ``supplemental_count``, or ``None`` if
        ``supplemental_count`` was ``None``.

    Raises
    ------
    ValueError
        if ``supplemental_count`` is given and its length differs from
        that of ``mapped_pixels``
    """
    # Reject mismatched inputs up front, before any grouping work is done.
    if supplemental_count is not None and len(supplemental_count) != len(mapped_pixels):
        raise ValueError("mapped pixels and supplemental counts must be the same length")

    # np.unique returns sorted values by default. The explicit ``sorted``
    # keyword is deliberately not passed: it only exists in NumPy >= 2.3 and
    # raises TypeError on older releases.
    mapped_pixel, unique_inverse, count_at_pixel = np.unique(
        mapped_pixels, return_inverse=True, return_counts=True
    )
    row_count_histo = SparseHistogram(mapped_pixel, count_at_pixel, highest_order)

    if supplemental_count is None:
        return (row_count_histo, None)

    # ``unique_inverse[i]`` is the position in ``mapped_pixel`` of row i's
    # pixel, so an unbuffered scatter-add accumulates each row's supplemental
    # value into its pixel's slot, correctly handling repeated positions.
    supplemental_values = np.asarray(supplemental_count, dtype=np.int64)
    supplemental_sums = np.zeros(len(mapped_pixel), dtype=np.int64)
    np.add.at(supplemental_sums, unique_inverse, supplemental_values)

    supplemental_count_histo = SparseHistogram(mapped_pixel, supplemental_sums, highest_order)
    return (row_count_histo, supplemental_count_histo)