llm.preprocess.shard

Pretraining text sharder.

python -m llm.preprocess.shard --help

cli()

cli(
    inputs: tuple[str],
    output_dir: str,
    size: str,
    format: str,
    shuffle: bool,
    log_level: str,
    rich: bool,
) -> None

Shard documents in FILEPATHS into equally sized files.
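For example, a typical invocation might look like the following (the input paths and shard size here are illustrative; the exact size strings accepted depend on readable_to_bytes):

python -m llm.preprocess.shard data/wiki.txt data/books.txt --output-dir shards/ --size 500MB --shuffle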

Source code in llm/preprocess/shard.py
@click.command()
@click.argument(
    'inputs',
    metavar='FILEPATHS',
    required=True,
    nargs=-1,
)
@click.option(
    '-o',
    '--output-dir',
    metavar='PATH',
    required=True,
    help='Output directory for encoded shards.',
)
@click.option(
    '-s',
    '--size',
    metavar='SIZE',
    required=True,
    help='Max data size of each shard.',
)
@click.option(
    '-f',
    '--format',
    default='shard-{index}.txt',
    help='Shard name format where {index} is replaced by shard index.',
)
@click.option(
    '--shuffle/--no-shuffle',
    default=False,
    help='Shuffle documents before sharding.',
)
@click.option(
    '--log-level',
    default='INFO',
    type=click.Choice(
        ['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        case_sensitive=False,
    ),
    help='Minimum logging level.',
)
@click.option(
    '--rich/--no-rich',
    default=False,
    help='Use rich output formatting.',
)
def cli(
    inputs: tuple[str],
    output_dir: str,
    size: str,
    format: str,  # noqa: A002
    shuffle: bool,
    log_level: str,
    rich: bool,
) -> None:
    """Shard documents in FILEPATHS into equally sized files."""
    init_logging(log_level, rich=rich)

    size_bytes = readable_to_bytes(size)

    shard(inputs, output_dir, format, size_bytes, shuffle)
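The shard and readable_to_bytes helpers called above are defined elsewhere in the package and are not shown on this page. As a rough illustration only, the sketch below shows one way the size-based sharding described by the options could behave; shard_sketch and its assumption that documents are newline-delimited are hypothetical and do not reflect the package's actual implementation.

import os
import random


def shard_sketch(
    inputs: tuple[str, ...],
    output_dir: str,
    name_format: str,
    max_bytes: int,
    shuffle: bool,
) -> None:
    # Collect newline-delimited documents from every input file.
    documents: list[str] = []
    for filepath in inputs:
        with open(filepath) as f:
            documents.extend(line for line in f if line.strip())

    if shuffle:
        random.shuffle(documents)

    os.makedirs(output_dir, exist_ok=True)

    index = 0
    current: list[str] = []
    current_bytes = 0

    def flush() -> None:
        nonlocal index, current, current_bytes
        if current:
            # Shard names follow the --format template, e.g. shard-0.txt.
            path = os.path.join(output_dir, name_format.format(index=index))
            with open(path, 'w') as f:
                f.writelines(current)
            index += 1
            current = []
            current_bytes = 0

    for doc in documents:
        doc_bytes = len(doc.encode('utf-8'))
        # Start a new shard once adding this document would exceed max_bytes.
        if current and current_bytes + doc_bytes > max_bytes:
            flush()
        current.append(doc)
        current_bytes += doc_bytes

    flush()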