"""
Retrieve dataset archive (.zip) files from the web, an s3 bucket, or local disk.
This script is primarily intended as a retrieval and extraction step before launching spacekit.dashboard
For more customized control of dataset retrieval (such as your own), use the spacekit.extractor.scrape module.
Examples:
"datasets": if set, chooses specific archive dataset filenames to retrieve and extract
src: "web" -> Fetch and extract from one of the spacekit data archives:
archive: name of collection (see ``spacekit.datasets.meta``)
download(scrape="web:calcloud")
download(scrape="web:svm")
src: "file" -> Fetch and extract from path on local disk
archive: path
download(scrape="file:another/path/to/data)
src: "s3" -> Fetch and extract from an S3 bucket on AWS
archive: bucketname
"datasets" -> string of S3 archive file basenames separated by comma (without the .zip or .tgz suffix)
download(scrape="s3:mybucketname", "2021-11-04-1636048291,2021-10-28-1635457222,2021-08-22-1629663047")
archive: json filepath containing metadata structured similar to dictionaries in ``spacekit.datasets.meta``
"""
import argparse
import sys
import os
import json
from spacekit.datasets.meta import spacekit_collections
from spacekit.extractor.scrape import WebScraper, S3Scraper, FileScraper
from spacekit.logger.log import SPACEKIT_LOG
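# Default S3 key prefix ("archive"); override via the PFX environment variable.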
S3PREFIX = os.environ.get("PFX", "archive")
def download(scrape="file:data", datasets="2022-02-14,2021-11-04,2021-10-28", dest="."):
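    # Split dest into a cache root directory and a subdirectory for the scrapers.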
if len(dest.split("/")) > 1:
cache_dir = os.path.abspath(dest.split("/")[0])
cache_subdir = "/".join(dest.split("/")[1:])
else:
cache_dir, cache_subdir = dest, "data"
    src, archive = scrape.split(":", 1)
    datasets = datasets.split(",")
    scraper = None
if src == "web":
if archive.split(".")[-1] == "json":
SPACEKIT_LOG.info("Scraping web via custom json file")
with open(archive, "r") as j:
collection = json.load(j)
scraper = WebScraper(
collection["uri"],
collection["data"],
cache_dir=cache_dir,
cache_subdir=cache_subdir,
)
elif archive in spacekit_collections.keys():
SPACEKIT_LOG.info(f"Scraping spacekit collection {archive.upper()}")
cc = spacekit_collections[archive] # "calcloud", "svm"
dd = {}
for d in datasets:
dd[d] = cc["data"][d]
scraper = WebScraper(
cc["uri"], dd, cache_dir=cache_dir, cache_subdir=cache_subdir
)
else:
SPACEKIT_LOG.error(
f"Must use custom json file or one of the spacekit collections: {list(spacekit_collections.keys())}"
)
elif src == "s3":
SPACEKIT_LOG.info("Scraping S3")
scraper = S3Scraper(
archive, pfx=S3PREFIX, cache_dir=cache_dir, cache_subdir=cache_subdir
)
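        # Construct the full S3 object keys from the dataset basenames.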
scraper.make_s3_keys(fnames=datasets)
elif src == "file":
SPACEKIT_LOG.info("Scraping local directory")
p = [f"{archive}/*.zip", f"{archive}/*"]
scraper = FileScraper(
patterns=p, clean=False, cache_dir=cache_dir, cache_subdir=cache_subdir
)
    if scraper is None:
        SPACEKIT_LOG.error(f"Could not initialize a scraper for scrape={scrape!r}")
        sys.exit(1)
    try:
        scraper.scrape()
        if scraper.fpaths:
            SPACEKIT_LOG.info(f"Datasets: {scraper.fpaths}")
            return scraper.fpaths
    except Exception as e:
        SPACEKIT_LOG.error(f"Could not locate datasets: {e}")
        sys.exit(1)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-s",
"--scrape",
default="web:calcloud",
help="Uses a key:uri format where options for the key are limited to web, s3, or file. \
The uri could be your own custom location if not using the default datasets. \
Examples are web:calcloud, web:custom.json, s3:mybucket, file:myfolder. \
Visit spacekit.readthedocs.io for more info.",
)
parser.add_argument(
"-d",
"--datasets",
default="2022-02-14,2021-11-04,2021-10-28",
help="Comma-separated string of keys identifying each dataset",
)
parser.add_argument("-o", "--out", default=None)
args = parser.parse_args()
fpaths = download(scrape=args.scrape, datasets=args.datasets, dest=args.out)
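
# Example CLI usage (a minimal sketch; bucket and folder names are hypothetical,
# and the module path assumes this file is installed as spacekit/datasets/beam.py):
#   python -m spacekit.datasets.beam -s web:calcloud -d 2021-11-04 -o data
#   python -m spacekit.datasets.beam -s s3:mybucket -d 2021-11-04-1636048291 -o data
#   python -m spacekit.datasets.beam -s file:path/to/archives -o data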