Source code for hipscat.catalog.dataset.dataset
from typing import Any, Dict, Tuple, Union
from typing_extensions import Self
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
from hipscat.io import FilePointer, file_io, paths
[docs]
class Dataset:
    """Base class for a HiPSCat dataset.

    A dataset couples a catalog_info metadata object with the data stored in
    parquet files, and optionally records where the catalog lives on disk.

    Subclasses may override ``CatalogInfoClass`` to require a more specific
    metadata type; both the constructor check and the metadata loader in
    ``_read_args`` go through that attribute.
    """

    CatalogInfoClass = BaseCatalogInfo

    def __init__(
        self,
        catalog_info: CatalogInfoClass,
        catalog_path=None,
        storage_options: Union[Dict[Any, Any], None] = None,
    ) -> None:
        """Create a Dataset from already-loaded catalog metadata.

        Args:
            catalog_info: metadata object describing the catalog; must be an
                instance of ``CatalogInfoClass``
            catalog_path: location of the catalog when it is stored on disk.
                The path is only recorded as metadata; nothing is loaded from it.
            storage_options: dictionary that contains abstract filesystem credentials

        Raises:
            TypeError: if ``catalog_info`` is not an instance of ``CatalogInfoClass``
        """
        expected_type = self.CatalogInfoClass
        if not isinstance(catalog_info, expected_type):
            raise TypeError(f"catalog_info type must be {expected_type}")

        self.catalog_info = catalog_info
        self.catalog_name = self.catalog_info.catalog_name

        self.catalog_path = catalog_path
        # A dataset counts as "on disk" exactly when a path was supplied.
        self.on_disk = catalog_path is not None
        self.storage_options = storage_options
        # The path is handed to the helper as given, including when it is None.
        self.catalog_base_dir = file_io.get_file_pointer_from_path(catalog_path)

    @classmethod
    def read_from_hipscat(
        cls, catalog_path: str, storage_options: Union[Dict[Any, Any], None] = None
    ) -> Self:
        """Load a HiPSCat catalog from its root directory.

        The expected files are checked first; the constructor arguments are then
        gathered through the ``_read_args`` / ``_read_kwargs`` hooks.

        Args:
            catalog_path: path to the root directory of the catalog
            storage_options: dictionary that contains abstract filesystem credentials

        Returns:
            The initialized catalog object

        Raises:
            FileNotFoundError: if the directory or its catalog info file is missing
        """
        base_dir = file_io.get_file_pointer_from_path(catalog_path)
        cls._check_files_exist(base_dir, storage_options=storage_options)
        return cls(
            *cls._read_args(base_dir, storage_options=storage_options),
            **cls._read_kwargs(base_dir, storage_options=storage_options),
        )

    @classmethod
    def _read_args(
        cls, catalog_base_dir: FilePointer, storage_options: Union[Dict[Any, Any], None] = None
    ) -> Tuple[CatalogInfoClass]:
        """Read the positional constructor arguments from the catalog directory.

        Args:
            catalog_base_dir: pointer to the root directory of the catalog
            storage_options: dictionary that contains abstract filesystem credentials

        Returns:
            A one-element tuple holding the catalog info read via ``CatalogInfoClass``
        """
        info_pointer = paths.get_catalog_info_pointer(catalog_base_dir)
        loaded_info = cls.CatalogInfoClass.read_from_metadata_file(
            info_pointer, storage_options=storage_options
        )
        return (loaded_info,)

    @classmethod
    def _read_kwargs(
        cls, catalog_base_dir: FilePointer, storage_options: Union[Dict[Any, Any], None] = None
    ) -> dict:
        """Assemble the keyword constructor arguments for a catalog directory.

        Args:
            catalog_base_dir: pointer to the root directory of the catalog
            storage_options: dictionary that contains abstract filesystem credentials

        Returns:
            A dict with the stringified base directory under ``catalog_path`` and
            the unchanged ``storage_options``
        """
        path_text = str(catalog_base_dir)
        return {"catalog_path": path_text, "storage_options": storage_options}

    @classmethod
    def _check_files_exist(
        cls, catalog_base_dir: FilePointer, storage_options: Union[Dict[Any, Any], None] = None
    ):
        """Verify that the catalog directory and its catalog info file exist.

        Args:
            catalog_base_dir: pointer to the root directory of the catalog
            storage_options: dictionary that contains abstract filesystem credentials

        Raises:
            FileNotFoundError: if the base directory does not exist, or if it
                exists but the catalog info file is not found inside it
        """
        directory_found = file_io.does_file_or_directory_exist(
            catalog_base_dir, storage_options=storage_options
        )
        if not directory_found:
            raise FileNotFoundError(f"No directory exists at {str(catalog_base_dir)}")

        # The info pointer is resolved only once the directory is known to exist.
        info_pointer = paths.get_catalog_info_pointer(catalog_base_dir)
        info_found = file_io.does_file_or_directory_exist(
            info_pointer, storage_options=storage_options
        )
        if not info_found:
            raise FileNotFoundError(f"No catalog info found where expected: {str(info_pointer)}")