llm.preprocess.roberta

RoBERTa pretraining encoder.

This implements the DOC-SENTENCES sampling strategy of RoBERTa. Samples are not pre-masked during preprocessing as in Devlin et al. (BERT); instead, they are dynamically masked at runtime as in RoBERTa. Next sentence prediction is also not used.

python -m llm.preprocess.roberta --help
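As an illustration of what dynamic masking means, the sketch below re-masks token IDs every time a batch is drawn rather than once during preprocessing. This is not this module's implementation; the function name, the -100 ignore label, and the 80/10/10 replacement split follow common BERT/RoBERTa conventions.

import random


def dynamic_mask(
    token_ids: list[int],
    mask_token_id: int,
    vocab_size: int,
    special_ids: set[int],
    mlm_prob: float = 0.15,
) -> tuple[list[int], list[int]]:
    # Return (masked inputs, labels); -100 marks positions that are not predicted.
    inputs = list(token_ids)
    labels = [-100] * len(token_ids)
    for i, tok in enumerate(token_ids):
        if tok in special_ids or random.random() >= mlm_prob:
            continue
        labels[i] = tok
        r = random.random()
        if r < 0.8:  # 80%: replace with the mask token
            inputs[i] = mask_token_id
        elif r < 0.9:  # 10%: replace with a random token
            inputs[i] = random.randrange(vocab_size)
        # remaining 10%: keep the original token unchanged
    return inputs, labels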

cli()

cli(
    inputs: tuple[str, ...],
    output_dir: str,
    tokenizer: str,
    max_seq_len: int,
    short_seq_prob: float,
    processes: int,
    log_level: str,
    rich: bool,
) -> None

Encode FILEPATHS for RoBERTa pretraining.
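For example, a typical invocation encodes a set of plain-text files with a trained tokenizer (the file paths here are illustrative):

python -m llm.preprocess.roberta data/shard-*.txt \
    --output-dir encoded/roberta \
    --tokenizer tokenizer.json \
    --max-seq-len 512 \
    --short-seq-prob 0.1 \
    --processes 4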

Source code in llm/preprocess/roberta.py
@click.command()
@click.argument(
    'inputs',
    metavar='FILEPATHS',
    required=True,
    nargs=-1,
)
@click.option(
    '-o',
    '--output-dir',
    metavar='PATH',
    required=True,
    help='Output directory for encoded shards.',
)
@click.option(
    '-t',
    '--tokenizer',
    metavar='PATH',
    required=True,
    help='Path to trained tokenizer to load.',
)
@click.option(
    '-l',
    '--max-seq-len',
    type=int,
    default=512,
    help='Maximum sequence length.',
)
@click.option(
    '-s',
    '--short-seq-prob',
    type=float,
    default=0.1,
    help='Probability to create shorter sequences.',
)
@click.option(
    '-p',
    '--processes',
    type=int,
    default=4,
    help='Number of processes for concurrent shard encoding.',
)
@click.option(
    '--log-level',
    default='INFO',
    type=click.Choice(
        ['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        case_sensitive=False,
    ),
    help='Minimum logging level.',
)
@click.option(
    '--rich/--no-rich',
    default=False,
    help='Use rich output formatting.',
)
def cli(
    inputs: tuple[str, ...],
    output_dir: str,
    tokenizer: str,
    max_seq_len: int,
    short_seq_prob: float,
    processes: int,
    log_level: str,
    rich: bool,
) -> None:
    """Encode FILEPATHS for RoBERTa pretraining."""
    init_logging(log_level, rich=rich)

    # This silences the warning:
    #
    # The current process just got forked, after parallelism has already been
    # used. Disabling parallelism to avoid deadlocks...
    # To disable this warning, you can either:
    #   - Avoid using `tokenizers` before the fork if possible
    #   - Explicitly set the environment variable TOKENIZERS_PARALLELISM=false
    #
    # Note we set this in a few places to ensure the environment variable
    # is set in subprocesses.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    encode_files(
        input_files=inputs,
        output_dir=output_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        short_seq_prob=short_seq_prob,
        processes=processes,
    )
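
The same encoding can also be driven from Python by calling encode_files with the keyword arguments shown above. A minimal sketch follows; the import path and file names are assumptions, not part of this page.

import os

from llm.preprocess.roberta import encode_files  # assumed import path

# Same workaround as the CLI to avoid the tokenizers fork warning.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

encode_files(
    input_files=('data/shard-00.txt', 'data/shard-01.txt'),  # illustrative paths
    output_dir='encoded/roberta',
    tokenizer='tokenizer.json',
    max_seq_len=512,
    short_seq_prob=0.1,
    processes=4,
)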