Source code for cadl.librispeech

"""LibriSpeech dataset, batch processing, and preprocessing.
"""
"""
Copyright 2017 Parag K. Mital.  See also NOTICE.md.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
from scipy.io import wavfile
from cadl.utils import download_and_extract_tar
from glob import glob
import subprocess
import numpy as np


def get_dataset(saveto='librispeech', convert_to_wav=False, kind='dev'):
    """Download the LibriSpeech dataset and convert to wav files.

    More info: http://www.openslr.org/12/

    This interface downloads the LibriSpeech dataset and attempts to
    convert the flac to wave files using ffmpeg.  If you do not have ffmpeg
    installed, this function will not be able to convert the files to waves.

    Parameters
    ----------
    saveto : str
        Directory to save the resulting dataset ['librispeech'].  The
        download is only attempted when this path does not exist yet.
    convert_to_wav : bool, optional
        If True and no ``.wav`` file exists anywhere under `saveto`, run
        ``ffmpeg`` on every ``.flac`` file found there, writing
        ``<name>.flac.wav`` next to each source file.  If wave files are
        already present, a warning is printed and nothing is converted.
    kind : str, optional
        Which archive to download: 'dev' (dev-clean), 'train-100'
        (train-clean-100) or 'train-360' (train-clean-360).  Any other
        value prints a notice and skips the download.

    Returns
    -------
    list of dict
        One entry per ``.wav`` file found under `saveto`, with keys:

        - 'name': path of the wave file as returned by ``glob``.
        - 'id': name of the directory two levels above the file
          (presumably the LibriSpeech speaker id).
        - 'chapter': name of the directory containing the file.
        - 'utterance': the part of the file name after the last '-',
          without the trailing ``.wav`` extension.

        The list is empty when no wave files are found.

    Raises
    ------
    subprocess.CalledProcessError
        If ``ffmpeg`` exits with a non-zero status during conversion.
    OSError
        If the ``ffmpeg`` executable cannot be launched.
    """
    if not os.path.exists(saveto):
        archives = {
            'dev':
                'http://www.openslr.org/resources/12/dev-clean.tar.gz',
            'train-100':
                'http://www.openslr.org/resources/12/train-clean-100.tar.gz',
            'train-360':
                'http://www.openslr.org/resources/12/train-clean-360.tar.gz',
        }
        url = archives.get(kind)
        if url is not None:
            download_and_extract_tar(url, saveto)
        else:
            print('Not downloading.  Pass in either ["dev"],'
                  '"train-100", or "train-360", in order to '
                  'download the dataset.')

    wav_pattern = '{}/**/*.wav'.format(saveto)
    wavs = glob(wav_pattern, recursive=True)
    if convert_to_wav:
        if not wavs:
            flacs = glob('{}/**/*.flac'.format(saveto), recursive=True)
            for f in flacs:
                # Argument list (no shell), so odd file names are passed
                # to ffmpeg verbatim; '-y' overwrites stale outputs.
                subprocess.check_call(
                    ['ffmpeg', '-i', f, '-f', 'wav', '-y', '%s.wav' % f])
            wavs = glob(wav_pattern, recursive=True)
        else:
            print('WARNING: Found existing wave files.  Not converting!')

    dataset = []
    for wav_i in wavs:
        # glob may return os.sep-separated components (e.g. on Windows);
        # normalise them so the split works on every platform.
        parts = wav_i.replace(os.sep, '/').split('/')
        id_i, chapter_i, utter_i = parts[-3:]
        # Remove the extension as a suffix.  str.strip('.wav') would treat
        # the argument as a character set and also eat any leading or
        # trailing '.', 'w', 'a' or 'v' belonging to the utterance id.
        stem = os.path.splitext(utter_i)[0]
        dataset.append({
            'name': wav_i,
            'id': id_i,
            'chapter': chapter_i,
            'utterance': stem.split('-')[-1]
        })
    if not wavs:
        print('LibriSpeech is a FLAC dataset.  Consider rerunning this '
              'command with convert_to_wav=True, to use ffmpeg to '
              'convert the flac files to wave files first. This requires '
              'the use of ffmpeg and so this should be installed first.')
    return dataset


def batch_generator(dataset,
                    batch_size=32,
                    max_sequence_length=6144,
                    maxval=32768.0,
                    threshold=0.2,
                    normalize=True):
    """Yield batches of randomly cropped audio segments and their ids.

    Each batch element is produced by picking a random entry of `dataset`,
    reading its wave file, and cutting a random window of
    `max_sequence_length` samples from it.  Windows whose peak amplitude
    (relative to `maxval`) does not exceed `threshold` are rejected and
    another random draw is made.  Files shorter than `max_sequence_length`
    are skipped.

    Note that the rejection loop has no retry limit: if no readable file
    contains a window louder than `threshold`, the generator never yields.

    Parameters
    ----------
    dataset : list of dict
        Entries with at least the keys 'name' (path to a wave file) and
        'id' (a value convertible to int32), e.g. the list returned by
        `get_dataset`.
    batch_size : int, optional
        Number of cropped segments per yielded batch.
    max_sequence_length : int, optional
        Length, in samples, of every cropped segment.
    maxval : float, optional
        Value the samples are divided by for thresholding and
        normalization; the default matches the magnitude range of
        16-bit audio.
    threshold : float, optional
        Minimum peak of ``abs(segment) / maxval`` a segment must exceed
        to be kept.
    normalize : bool, optional
        If True, yielded segments are divided by `maxval`; otherwise the
        raw sample values are yielded.

    Yields
    ------
    tuple of np.ndarray
        ``(wavs, ids)`` where `wavs` is a float32 array holding
        `batch_size` segments of `max_sequence_length` samples each and
        `ids` is the int32 array of the corresponding 'id' values.
        ``len(dataset) // batch_size`` batches are yielded in total.

    Raises
    ------
    ValueError
        If every file in `dataset` is shorter than `max_sequence_length`.
    """
    n_batches = len(dataset) // batch_size
    # Indices of files known to be too short to crop, so they are neither
    # re-read from disk nor able to stall the sampling loop unnoticed.
    too_short = set()
    for _ in range(n_batches):
        cropped_wavs, ids = [], []
        while len(cropped_wavs) < batch_size:
            idx_i = int(np.random.randint(len(dataset)))
            if idx_i in too_short:
                continue
            entry = dataset[idx_i]
            wav_i = wavfile.read(entry['name'])[1]
            # Number of valid start offsets, including the very last
            # window that ends exactly at the end of the file.
            n_offsets = len(wav_i) - max_sequence_length + 1
            if n_offsets < 1:
                too_short.add(idx_i)
                if len(too_short) == len(dataset):
                    raise ValueError(
                        'No file in the dataset has at least '
                        '{} samples.'.format(max_sequence_length))
                continue
            start = np.random.randint(n_offsets)
            cropped_wav = wav_i[start:start + max_sequence_length]
            # Scale before taking the absolute value: abs() of the int16
            # minimum (-32768) overflows and stays negative.
            scaled_wav = cropped_wav / maxval
            if np.max(np.abs(scaled_wav)) > threshold:
                cropped_wavs.append(scaled_wav if normalize else cropped_wav)
                ids.append(entry['id'])
        yield np.array(cropped_wavs, np.float32), np.array(ids, np.int32)