Source code for cadl.seq2seq

"""Sequence to Sequence models w/ Attention and BiDirectional Dynamic RNNs.
"""
"""
Copyright 2017 Parag K. Mital.  See also NOTICE.md.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import tensorflow as tf
import numpy as np
import nltk
import pickle
from cadl import cornell

# Special vocabulary symbols:
# PAD is used to pad a sequence to a fixed size
# GO marks the start of decoding (it is prepended to the decoder input)
# EOS is for the end of decoding
# UNK is for out of vocabulary words
_PAD, _GO, _EOS, _UNK = "_PAD", "_GO", "_EOS", "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
PAD_ID, GO_ID, EOS_ID, UNK_ID = range(4)
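
# For illustration only (word ids other than the special symbols are
# hypothetical): a padded question/reply pair such as
# "how are you ?" -> "i am fine ." might be encoded as
#
#   encoder input : [how_id, are_id, you_id, ?_id, PAD_ID, PAD_ID]
#   decoder target: [i_id, am_id, fine_id, ._id, EOS_ID, PAD_ID]
#   decoder input : [GO_ID, i_id, am_id, fine_id, ._id, EOS_ID]
#
# i.e. GO_ID starts decoding, EOS_ID ends it, PAD_ID fills each sequence to a
# fixed length, and any out-of-vocabulary word is replaced by UNK_ID.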


def _create_embedding(x, vocab_size, embed_size, embed_matrix=None):
    """Summary

    Parameters
    ----------
    x : TYPE
        Description
    vocab_size : TYPE
        Description
    embed_size : TYPE
        Description
    embed_matrix : None, optional
        Description

    Returns
    -------
    TYPE
        Description
    """
    # Create an embedding matrix if one isn't given
    if embed_matrix is None:
        # This is a big matrix: one embed_size-dim row per vocabulary entry
        embed_matrix = tf.get_variable(
            name="embedding_matrix",
            shape=[vocab_size, embed_size],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(-1.0, 1.0))

    # Look up each id in x in the embedding matrix, giving a tensor of shape
    # [batch_size, max_time, embed_size]
    embed = tf.nn.embedding_lookup(embed_matrix, x)

    return embed, embed_matrix
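

# Shape sketch (hypothetical helper, not part of the original module, assuming
# TensorFlow 1.x): looking up ids of shape [batch_size, max_time] yields an
# embedding tensor of shape [batch_size, max_time, embed_size].
def _example_embedding_shapes():
    with tf.Graph().as_default():
        ids = tf.constant([[GO_ID, 5, 6, EOS_ID]], dtype=tf.int32)  # [1, 4]
        embed, embed_matrix = _create_embedding(
            ids, vocab_size=100, embed_size=8)
        # embed: [1, 4, 8], embed_matrix: [100, 8]
        return (embed.get_shape().as_list(),
                embed_matrix.get_shape().as_list())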


def _create_rnn_cell(n_neurons, n_layers, keep_prob):
    """Summary

    Parameters
    ----------
    n_neurons : TYPE
        Description
    n_layers : TYPE
        Description
    keep_prob : TYPE
        Description

    Returns
    -------
    TYPE
        Description
    """
    import tensorflow.contrib.rnn as rnn

    cell_fw = rnn.LayerNormBasicLSTMCell(
        num_units=n_neurons, dropout_keep_prob=keep_prob)
    # Build a deeper recurrent net when using more than one layer
    if n_layers > 1:
        cells = [cell_fw]
        for layer_i in range(1, n_layers):
            with tf.variable_scope('{}'.format(layer_i)):
                cell_fw = rnn.LayerNormBasicLSTMCell(
                    num_units=n_neurons, dropout_keep_prob=keep_prob)
                cells.append(cell_fw)
        cell_fw = rnn.MultiRNNCell(cells)
    return cell_fw


def _create_encoder(embed, lengths, batch_size, n_enc_neurons, n_layers,
                    keep_prob):
    """Summary

    Parameters
    ----------
    embed : TYPE
        Description
    lengths : TYPE
        Description
    batch_size : TYPE
        Description
    n_enc_neurons : TYPE
        Description
    n_layers : TYPE
        Description
    keep_prob : TYPE
        Description

    Returns
    -------
    TYPE
        Description
    """
    # Create the RNN Cells for encoder
    with tf.variable_scope('forward'):
        cell_fw = _create_rnn_cell(n_enc_neurons, n_layers, keep_prob)

    # Create the internal multi-layer cell for the backward RNN.
    with tf.variable_scope('backward'):
        cell_bw = _create_rnn_cell(n_enc_neurons, n_layers, keep_prob)

    # Now hook up the cells to the input
    # [batch_size, max_time, embed_size]
    (outputs, final_state) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=embed,
        sequence_length=lengths,
        time_major=False,
        dtype=tf.float32)

    return outputs, final_state
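

# Note: tf.nn.bidirectional_dynamic_rnn returns per-direction tuples, i.e.
# outputs == (fw_outputs, bw_outputs) and final_state == (fw_state, bw_state);
# create_model below passes only the forward halves to the decoder. A minimal
# shape sketch (hypothetical helper, assuming TensorFlow 1.x w/ tf.contrib):
def _example_encoder_shapes():
    with tf.Graph().as_default():
        embed = tf.zeros([2, 5, 8], dtype=tf.float32)  # [batch, time, embed]
        lengths = tf.constant([5, 3], dtype=tf.int32)
        outputs, final_state = _create_encoder(
            embed=embed, lengths=lengths, batch_size=2,
            n_enc_neurons=16, n_layers=1, keep_prob=1.0)
        fw_outputs, bw_outputs = outputs       # each [2, 5, 16]
        fw_state, bw_state = final_state       # per-direction LSTM states
        return fw_outputs, fw_state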


def _create_decoder(cells,
                    batch_size,
                    encoder_outputs,
                    encoder_state,
                    encoder_lengths,
                    decoding_inputs,
                    decoding_lengths,
                    embed_matrix,
                    target_vocab_size,
                    scope,
                    max_sequence_size,
                    use_attention=True):
    """Summary

    Parameters
    ----------
    cells : TYPE
        Description
    batch_size : TYPE
        Description
    encoder_outputs : TYPE
        Description
    encoder_state : TYPE
        Description
    encoder_lengths : TYPE
        Description
    decoding_inputs : TYPE
        Description
    decoding_lengths : TYPE
        Description
    embed_matrix : TYPE
        Description
    target_vocab_size : TYPE
        Description
    scope : TYPE
        Description
    max_sequence_size : TYPE
        Description
    use_attention : bool, optional
        Description

    Returns
    -------
    TYPE
        Description
    """
    from tensorflow.python.layers.core import Dense

    # Output projection
    output_layer = Dense(target_vocab_size, name='output_projection')

    # Setup Attention
    if use_attention:
        attn_mech = tf.contrib.seq2seq.LuongAttention(
            cells.output_size, encoder_outputs, encoder_lengths, scale=True)
        cells = tf.contrib.seq2seq.AttentionWrapper(
            cell=cells,
            attention_mechanism=attn_mech,
            attention_layer_size=cells.output_size,
            alignment_history=False)
        initial_state = cells.zero_state(
            dtype=tf.float32, batch_size=batch_size)
        initial_state = initial_state.clone(cell_state=encoder_state)
    else:
        # Without attention, decoding starts directly from the encoder state
        initial_state = encoder_state

    # Set up training and build the decoder
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=decoding_inputs,
        sequence_length=decoding_lengths,
        time_major=False)
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells,
        helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    train_logits = tf.identity(train_outputs.rnn_output, name='train_logits')

    # Set up inference and build the decoder, reusing the training variables
    scope.reuse_variables()
    start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32), [batch_size])
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=embed_matrix, start_tokens=start_tokens, end_token=EOS_ID)
    infer_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells,
        helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        infer_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    infer_logits = tf.identity(infer_outputs.sample_id, name='infer_logits')

    return train_logits, infer_logits
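

# Note: train_logits holds the per-step vocabulary logits
# ([batch_size, time, target_vocab_size]) used by the training loss, while
# infer_logits holds the greedily decoded word ids ([batch_size, time])
# produced at inference time.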


def create_model(source_vocab_size=10000,
                 target_vocab_size=10000,
                 input_embed_size=512,
                 target_embed_size=512,
                 share_input_and_target_embedding=True,
                 n_neurons=512,
                 n_layers=4,
                 use_attention=True,
                 max_sequence_size=30):
    """Summary

    Parameters
    ----------
    source_vocab_size : int, optional
        Description
    target_vocab_size : int, optional
        Description
    input_embed_size : int, optional
        Description
    target_embed_size : int, optional
        Description
    share_input_and_target_embedding : bool, optional
        Description
    n_neurons : int, optional
        Description
    n_layers : int, optional
        Description
    use_attention : bool, optional
        Description
    max_sequence_size : int, optional
        Description

    Returns
    -------
    TYPE
        Description

    Raises
    ------
    ValueError
        Description
    """
    n_enc_neurons = n_neurons
    n_dec_neurons = n_neurons

    # First sentence (i.e. input, original language sentence before
    # translation): [batch_size, max_time]
    source = tf.placeholder(tf.int32, shape=(None, None), name='source')

    # User should also pass in the sequence lengths
    source_lengths = tf.placeholder(
        tf.int32, shape=(None,), name='source_lengths')

    # Second sentence (i.e. reply, translation, etc...)
    # [batch_size, max_time]
    target = tf.placeholder(tf.int32, shape=(None, None), name='target')

    # User should also pass in the sequence lengths
    target_lengths = tf.placeholder(
        tf.int32, shape=(None,), name='target_lengths')

    # Dropout
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # Symbolic shapes
    batch_size, sequence_length = tf.unstack(tf.shape(source))

    # Get the input to the decoder by removing the last element
    # and adding a 'go' symbol as the first element
    with tf.variable_scope('target/slicing'):
        slice = tf.strided_slice(target, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO_ID), slice], 1)

    # Embed word ids using the source embedding
    with tf.variable_scope('source/embedding'):
        source_embed, source_embed_matrix = _create_embedding(
            x=source,
            vocab_size=source_vocab_size,
            embed_size=input_embed_size)

    # Embed word ids using the target embedding
    with tf.variable_scope('target/embedding'):
        # Check whether we need a new embedding matrix or not. If we are, for
        # instance, translating to another language, then we'd need different
        # vocabularies for the input and output, and so new embeddings.
        # However, if we are, for instance, building a chatbot in the same
        # language, then it doesn't make sense to have different embeddings
        # and we should share them.
        if (share_input_and_target_embedding and
                source_vocab_size == target_vocab_size):
            target_input_embed, target_embed_matrix = _create_embedding(
                x=decoder_input,
                vocab_size=target_vocab_size,
                embed_size=target_embed_size,
                embed_matrix=source_embed_matrix)
        elif share_input_and_target_embedding:
            raise ValueError(
                'source_vocab_size must equal target_vocab_size if ' +
                'sharing input and target embeddings')
        else:
            target_input_embed, target_embed_matrix = _create_embedding(
                x=decoder_input,
                vocab_size=target_vocab_size,
                embed_size=target_embed_size)

    # Build the encoder
    with tf.variable_scope('encoder'):
        encoder_outputs, encoder_state = _create_encoder(
            embed=source_embed,
            lengths=source_lengths,
            batch_size=batch_size,
            n_enc_neurons=n_enc_neurons,
            n_layers=n_layers,
            keep_prob=keep_prob)

    # Build the decoder
    with tf.variable_scope('decoder') as scope:
        cell_fw = _create_rnn_cell(n_dec_neurons, n_layers, keep_prob)
        decoding_train_logits, decoding_infer_logits = _create_decoder(
            cells=cell_fw,
            batch_size=batch_size,
            encoder_outputs=encoder_outputs[0],
            encoder_state=encoder_state[0],
            encoder_lengths=source_lengths,
            decoding_inputs=target_input_embed,
            decoding_lengths=target_lengths,
            embed_matrix=target_embed_matrix,
            target_vocab_size=target_vocab_size,
            scope=scope,
            max_sequence_size=max_sequence_size)

    with tf.variable_scope('loss'):
        weights = tf.cast(tf.sequence_mask(target_lengths), tf.float32)
        loss = tf.contrib.seq2seq.sequence_loss(
            logits=tf.reshape(decoding_train_logits, [
                batch_size, tf.reduce_max(target_lengths), target_vocab_size
            ]),
            targets=target,
            weights=weights)

    return {
        'loss': loss,
        'source': source,
        'source_lengths': source_lengths,
        'target': target,
        'target_lengths': target_lengths,
        'keep_prob': keep_prob,
        'thought_vector': encoder_state,
        'decoder': decoding_infer_logits
    }
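

# Minimal usage sketch (hypothetical toy sizes, assuming TensorFlow 1.x with
# tf.contrib). The target fed here is trimmed to max(target_lengths), as
# batch_generator below does, so the loss reshape lines up:
def _example_create_model():
    with tf.Graph().as_default():
        net = create_model(
            source_vocab_size=50, target_vocab_size=50,
            input_embed_size=16, target_embed_size=16,
            n_neurons=16, n_layers=1, max_sequence_size=8)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            loss = sess.run(net['loss'], feed_dict={
                net['keep_prob']: 1.0,
                net['source']: [[4, 5, 6, EOS_ID]],
                net['source_lengths']: [4],
                net['target']: [[7, 8, EOS_ID]],
                net['target_lengths']: [3]})
            return loss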


def batch_generator(sources,
                    targets,
                    source_lengths,
                    target_lengths,
                    batch_size=50):
    """Summary

    Parameters
    ----------
    sources : TYPE
        Description
    targets : TYPE
        Description
    source_lengths : TYPE
        Description
    target_lengths : TYPE
        Description
    batch_size : int, optional
        Description

    Yields
    ------
    TYPE
        Description
    """
    # Shuffle the order of the dataset
    idxs = np.random.permutation(np.arange(len(sources)))
    n_batches = len(idxs) // batch_size
    for batch_i in range(n_batches):
        this_idxs = idxs[batch_i * batch_size:(batch_i + 1) * batch_size]
        this_sources = sources[this_idxs, :]
        this_targets = targets[this_idxs, :]
        this_source_lengths = source_lengths[this_idxs]
        this_target_lengths = target_lengths[this_idxs]
        # Trim each batch to its longest source and target sequence
        yield (this_sources[:, :np.max(this_source_lengths)],
               this_targets[:, :np.max(this_target_lengths)],
               this_source_lengths,
               this_target_lengths)
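

# Usage sketch (hypothetical arrays): batches are drawn in a random order and
# trimmed to the longest source/target sequence within each batch.
def _example_batch_generator():
    n, max_len = 200, 20
    sources = np.full((n, max_len), PAD_ID, dtype=np.int32)
    targets = np.full((n, max_len), PAD_ID, dtype=np.int32)
    source_lengths = np.random.randint(3, max_len, size=n)
    target_lengths = np.random.randint(3, max_len, size=n)
    for batch in batch_generator(sources, targets, source_lengths,
                                 target_lengths, batch_size=50):
        batch_sources, batch_targets, batch_src_len, batch_tgt_len = batch
        assert batch_sources.shape == (50, batch_src_len.max())
        assert batch_targets.shape == (50, batch_tgt_len.max())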


def preprocess(text, min_count=5, min_length=3, max_length=30):
    """Summary

    Parameters
    ----------
    text : TYPE
        Description
    min_count : int, optional
        Description
    min_length : int, optional
        Description
    max_length : int, optional
        Description

    Returns
    -------
    TYPE
        Description
    """
    sentences = [el for s in text for el in nltk.sent_tokenize(s)]

    # We'll first tokenize each sentence into words to get a sense of
    # how long each sentence is:
    words = [[word.lower() for word in nltk.word_tokenize(s)]
             for s in sentences]

    # Then see how long each sentence is:
    lengths = np.array([len(s) for s in words])

    # Keep only sentences within [min_length, max_length)
    good_idxs = np.where((lengths >= min_length) & (lengths < max_length))[0]
    dataset = [words[idx] for idx in good_idxs]

    # Count word frequencies and drop rare words
    fdist = nltk.FreqDist([word for sentence in dataset for word in sentence])
    vocab_counts = [el for el in fdist.most_common() if el[1] > min_count]

    # First sort the vocabulary
    vocab = [v[0] for v in vocab_counts]
    vocab.sort()

    # Now add the special symbols:
    vocab = _START_VOCAB + vocab

    # Then create the word to id mapping
    vocab = {k: v for v, k in enumerate(vocab)}

    with open('vocab.pkl', 'wb') as fp:
        pickle.dump(vocab, fp)

    unked = word2id(dataset, vocab)
    return unked, vocab
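

# Usage sketch (assumes the NLTK 'punkt' tokenizer data has been downloaded;
# note that preprocess also writes 'vocab.pkl' into the working directory):
def _example_preprocess():
    text = ["How are you doing today?",
            "I am doing very well, thank you.",
            "What are you doing later today?"] * 10
    unked, vocab = preprocess(text, min_count=1, min_length=3, max_length=30)
    # unked is a list of id sequences; vocab maps word -> id, with the special
    # symbols _PAD, _GO, _EOS and _UNK occupying ids 0 through 3.
    return unked, vocab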


def word2id(words, vocab):
    """Summary

    Parameters
    ----------
    words : TYPE
        Description
    vocab : TYPE
        Description

    Returns
    -------
    TYPE
        Description
    """
    unked = []
    for s in words:
        this_sentence = [vocab.get(w, UNK_ID) for w in s]
        unked.append(this_sentence)
    return unked


def id2word(ids, vocab):
    """Summary

    Parameters
    ----------
    ids : TYPE
        Description
    vocab : TYPE
        Description

    Returns
    -------
    TYPE
        Description
    """
    words = []
    id2words = {v: k for k, v in vocab.items()}
    for s in ids:
        this_sentence = [id2words.get(w) for w in s]
        words.append(this_sentence)
    return words
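

# Round-trip sketch (hypothetical two-word vocabulary): words missing from the
# vocabulary map to UNK_ID, so the round trip is lossy for unknown tokens.
def _example_word2id_roundtrip():
    vocab = {k: v for v, k in enumerate(_START_VOCAB + ['hello', 'there'])}
    ids = word2id([['hello', 'there', 'friend']], vocab)
    # ids == [[4, 5, UNK_ID]]
    words = id2word(ids, vocab)
    # words == [['hello', 'there', '_UNK']]
    return words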


def train(text,
          max_sequence_size=20,
          use_attention=True,
          min_count=25,
          min_length=5,
          n_epochs=1000,
          batch_size=100):
    """Summary

    Parameters
    ----------
    text : TYPE
        Description
    max_sequence_size : int, optional
        Description
    use_attention : bool, optional
        Description
    min_count : int, optional
        Description
    min_length : int, optional
        Description
    n_epochs : int, optional
        Description
    batch_size : int, optional
        Description
    """
    # Preprocess it to word IDs including UNKs for out of vocabulary words
    unked, vocab = preprocess(
        text,
        min_count=min_count,
        min_length=min_length,
        max_length=max_sequence_size - 1)

    # Get the vocabulary size
    vocab_size = len(vocab)

    # Create input output pairs formed by neighboring sentences of dialog
    sources_list, targets_list = unked[:-1], unked[1:]

    # Store the final lengths
    source_lengths = np.zeros((len(sources_list)), dtype=np.int32)
    target_lengths = np.zeros((len(targets_list)), dtype=np.int32)
    sources = np.ones(
        (len(sources_list), max_sequence_size), dtype=np.int32) * PAD_ID
    targets = np.ones(
        (len(targets_list), max_sequence_size), dtype=np.int32) * PAD_ID
    for i, (source_i, target_i) in enumerate(zip(sources_list, targets_list)):
        el = source_i
        source_lengths[i] = len(el)
        sources[i, :len(el)] = el
        el = target_i + [EOS_ID]
        target_lengths[i] = len(el)
        targets[i, :len(el)] = el

    sess = tf.Session()
    net = create_model(
        max_sequence_size=max_sequence_size,
        use_attention=use_attention,
        source_vocab_size=vocab_size,
        target_vocab_size=vocab_size)
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    opt = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(net['loss'])
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    saver = tf.train.Saver()

    def decode(tokens, lengths):
        """Summary

        Parameters
        ----------
        tokens : TYPE
            Description
        lengths : TYPE
            Description
        """
        decoding = sess.run(
            net['decoder'],
            feed_dict={
                net['keep_prob']: 1.0,
                net['source']: tokens,
                net['source_lengths']: lengths
            })
        print('input:', " ".join(id2word([tokens[0]], vocab)[0]))
        print('output:', " ".join(id2word([decoding[0]], vocab)[0]))

    current_learning_rate = 0.01
    for epoch_i in range(n_epochs):
        total = 0
        for it_i, (this_sources, this_targets,
                   this_source_lengths, this_target_lengths) \
                in enumerate(batch_generator(
                    sources, targets, source_lengths, target_lengths,
                    batch_size=batch_size)):
            if it_i % 1000 == 0:
                current_learning_rate = max(0.0001,
                                            current_learning_rate * 0.99)
                print(it_i)
                decode(this_sources[0:1], this_source_lengths[0:1])
            l = sess.run(
                [net['loss'], opt],
                feed_dict={
                    learning_rate: current_learning_rate,
                    net['keep_prob']: 0.8,
                    net['source']: this_sources,
                    net['target']: this_targets,
                    net['source_lengths']: this_source_lengths,
                    net['target_lengths']: this_target_lengths
                })[0]
            total = total + l
            print('{}: {}'.format(it_i, total / (it_i + 1)), end='\r')
        # End of epoch, save
        print('epoch {}: {}'.format(epoch_i, total / it_i))
        saver.save(sess, './dynamic-seq2seq.ckpt', global_step=it_i)
    sess.close()
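

# Toy-scale usage sketch (hypothetical corpus; the defaults assume a much
# larger dialog corpus, and training writes './dynamic-seq2seq.ckpt*'
# checkpoints plus 'vocab.pkl' into the working directory):
def _example_train_toy():
    text = ["How are you doing today my friend?",
            "I am doing very well thank you my friend.",
            "What are you doing later today my friend?"] * 200
    train(text, max_sequence_size=12, min_count=1, min_length=3,
          n_epochs=1, batch_size=10)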


def train_cornell(**kwargs):
    """Summary

    Parameters
    ----------
    **kwargs
        Description

    Returns
    -------
    TYPE
        Description
    """
    # Get the cornell dataset text
    text = cornell.get_scripts()
    return train(text, **kwargs)