@click.command()
@click.option(
    '-d',
    '--dataset',
    type=click.Choice(['wikipedia', 'bookscorpus'], case_sensitive=False),
    required=True,
    help='Dataset to download.',
)
@click.option(
    '-o',
    '--output-dir',
    metavar='PATH',
    required=True,
    help='Output directory.',
)
@click.option(
    '--log-level',
    default='INFO',
    type=click.Choice(
        ['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        case_sensitive=False,
    ),
    help='Minimum logging level.',
)
@click.option(
    '--rich/--no-rich',
    default=False,
    help='Use rich output formatting.',
)
def cli(
    # The Literal must list the same names as the --dataset click.Choice
    # above and the comparisons in the body.
    dataset: Literal['wikipedia', 'bookscorpus'],
    output_dir: str,
    log_level: str,
    rich: bool,
) -> None:
    """Pretraining text downloader."""
    # NOTE: click uses the docstring above as the command's --help text, so
    # it is kept short and the details are documented in comments instead.
    #
    # Parameters (all supplied by the click options declared above):
    #   dataset:    which dataset to download, 'wikipedia' or 'bookscorpus'.
    #   output_dir: directory path handed unchanged to the download function.
    #   log_level:  one of DEBUG / INFO / WARNING / ERROR; defaults to INFO.
    #   rich:       value of the --rich/--no-rich flag; defaults to False.
    #
    # Raises:
    #   AssertionError: if `dataset` matches neither known name.

    # Set up logging first, before either download function is called.
    init_logging(log_level, rich=rich)

    if dataset == 'wikipedia':
        download_wikipedia(output_dir)
    elif dataset == 'bookscorpus':
        download_bookscorpus(output_dir)
    else:
        # click.Choice should already have rejected any other value, so
        # getting here means the option list and this dispatch are out of
        # sync.
        raise AssertionError('Unreachable.')