CN-EN-Translation-BERT
Download the code from GitHub:
https://github.com/dragen1860/Deep-Learning-with-TensorFlow-book
https://github.com/dragen1860/TensorFlow-2.x-Tutorials
attention.py
import tensorflow as tf


def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights


# ## Multi-head attention

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention,
                                        perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


def main():
    temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
    y = tf.random.uniform((1, 60, 768))  # (batch_size, encoder_sequence, d_model)
    q = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
    out, attn = temp_mha(y, k=y, q=q, mask=None)
    print(out.shape, attn.shape)


if __name__ == '__main__':
    main()
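A quick way to see what the mask argument does: positions where the mask is 1 receive a -1e9 logit, so they get (near) zero attention weight after the softmax. The snippet below is a minimal standalone sketch, not part of attention.py; the toy tensors and shapes are made up for illustration.

import tensorflow as tf
from attention import scaled_dot_product_attention

q = tf.random.uniform((1, 1, 3))       # (batch, seq_len_q, depth)
k = v = tf.random.uniform((1, 4, 3))   # (batch, seq_len_k, depth)
# pretend the last key position is padding; the mask broadcasts to (..., seq_len_q, seq_len_k)
mask = tf.constant([[[0., 0., 0., 1.]]])
out, weights = scaled_dot_product_attention(q, k, v, mask)
print(weights.numpy())                 # last column is ~0, the remaining weights sum to ~1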
attlayer.py
import tensorflow as tf
from attention import MultiHeadAttention
from utils import positional_encoding


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])


# ## Encoder Layer and Decoder Layer

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2


# Each decoder layer consists of sublayers:
#
# 1. Masked multi-head attention (with look ahead mask and padding mask)
#
# 2. Multi-head attention (with padding mask). V (value) and K (key) receive the encoder output as inputs. Q (query) receives the output from the masked multi-head attention sublayer.
#
# 3. Point wise feed forward networks
#
# Each of these sublayers has a residual connection around it followed by a layer normalization.
# The output of each sublayer is LayerNorm(x + Sublayer(x)). The normalization is done on the
# d_model (last) axis.

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2


def main():
    sample_encoder_layer = EncoderLayer(512, 8, 2048)
    sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
    print(sample_encoder_layer_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder_layer = DecoderLayer(512, 8, 2048)
    sample_encoder_output = tf.random.uniform((64, 128, 768))
    sample_decoder_layer_output, _, _ = sample_decoder_layer(
        tf.random.uniform((64, 50, 512)), sample_encoder_output, False, None, None)
    print(sample_decoder_layer_output.shape)  # (batch_size, target_seq_len, d_model)


if __name__ == '__main__':
    main()
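The feed-forward sublayer is applied identically at every position; only the last axis changes size (d_model -> dff -> d_model). A one-line shape check, separate from attlayer.py, with toy shapes chosen only for illustration:

import tensorflow as tf
from attlayer import point_wise_feed_forward_network

ffn = point_wise_feed_forward_network(d_model=512, dff=2048)
print(ffn(tf.random.uniform((64, 50, 512))).shape)  # (64, 50, 512) -- back to d_model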
bert_train.py
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from tokenizer import get_tokenizer
from bertmodel import Transformer, Config
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 50000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

config = Config(num_layers=6, d_model=256, dff=1024, num_heads=8)

target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

MODEL_DIR = "chinese_L-12_H-768_A-12"
bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
bert_ckpt_file = os.path.join(MODEL_DIR, "bert_model.ckpt")

transformer = Transformer(config=config,
                          target_vocab_size=target_vocab_size,
                          bert_config_file=bert_config_file)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))

fn_out, _ = transformer(inp, tar_inp,
                        True,
                        enc_padding_mask=None,
                        look_ahead_mask=None,
                        dec_padding_mask=None)

print(tar_inp.shape)  # (batch_size, tar_seq_len)
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)

# init bert pre-trained weights
transformer.restore_encoder(bert_ckpt_file)
transformer.summary()

learning_rate = CustomSchedule(config.d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

checkpoint_path = "./zh-en/bert"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')


@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp,
                                     True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


# Chinese is used as the input language and English is the target language.
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

for epoch in range(4):
    res = translator.do('虽然继承了祖荫,但朴槿惠已经证明了自己是个机敏而老练的政治家。')

    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 500 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
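The padding mask inside loss_function is what keeps the model from being rewarded for predicting pad tokens. Below is a self-contained toy check of that masking; the numbers are made-up assumptions, not output of the training script.

import tensorflow as tf

loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
real = tf.constant([[2, 1, 0, 0]])                 # two real tokens, two pads (id 0)
pred = tf.random.uniform((1, 4, 5))                # (batch, seq_len, vocab) logits
per_token = loss_obj(real, pred)                   # (1, 4) loss per position
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), per_token.dtype)
print((per_token * mask).numpy())                  # pad positions contribute exactly 0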
bertmodel.py
import tensorflow as tf

from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights
from bert.loader import map_to_stock_variable_name

from utils import positional_encoding
from attlayer import DecoderLayer


class Config(object):
    def __init__(self, num_layers, d_model, dff, num_heads):
        self.num_layers = num_layers
        self.d_model = d_model
        self.dff = dff
        self.num_heads = num_heads


def build_encoder(config_file):
    with tf.io.gfile.GFile(config_file, "r") as reader:
        stock_params = StockBertConfig.from_json_string(reader.read())
        bert_params = stock_params.to_bert_model_layer_params()

    return BertModelLayer.from_params(bert_params, name="bert")


class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


class Transformer(tf.keras.Model):
    def __init__(self, config, target_vocab_size, bert_config_file,
                 bert_training=False, rate=0.1, name='transformer'):
        super(Transformer, self).__init__(name=name)

        self.encoder = build_encoder(config_file=bert_config_file)
        self.encoder.trainable = bert_training

        self.decoder = Decoder(config.num_layers, config.d_model,
                               config.num_heads, config.dff, target_vocab_size, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def load_stock_weights(self, bert: BertModelLayer, ckpt_file):
        assert isinstance(bert, BertModelLayer), "Expecting a BertModelLayer instance as first argument"
        assert tf.compat.v1.train.checkpoint_exists(ckpt_file), "Checkpoint does not exist: {}".format(ckpt_file)
        ckpt_reader = tf.train.load_checkpoint(ckpt_file)

        bert_prefix = 'transformer/bert'

        weights = []
        for weight in bert.weights:
            stock_name = map_to_stock_variable_name(weight.name, bert_prefix)

            if ckpt_reader.has_tensor(stock_name):
                value = ckpt_reader.get_tensor(stock_name)
                weights.append(value)
            else:
                raise ValueError("No value for:[{}], i.e.:[{}] in:[{}]".format(
                    weight.name, stock_name, ckpt_file))

        bert.set_weights(weights)
        print("Done loading {} BERT weights from: {} into {} (prefix:{})".format(
            len(weights), ckpt_file, bert, bert_prefix))

    def restore_encoder(self, bert_ckpt_file):
        # loading the original pre-trained weights into the BERT layer:
        self.load_stock_weights(self.encoder, bert_ckpt_file)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training=self.encoder.trainable)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights
test.py
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from utils import create_masks


class Translator:

    def __init__(self, tokenizer_zh, tokenizer_en, model, MAX_SEQ_LENGTH):
        self.tokenizer_zh = tokenizer_zh
        self.tokenizer_en = tokenizer_en
        self.model = model
        self.MAX_SEQ_LENGTH = MAX_SEQ_LENGTH

    def encode_zh(self, zh):
        tokens_zh = self.tokenizer_zh.tokenize(zh)
        lang1 = self.tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + tokens_zh + ['[SEP]'])
        return lang1

    def evaluate(self, inp_sentence):
        # normalize input sentence
        inp_sentence = self.encode_zh(inp_sentence)
        encoder_input = tf.expand_dims(inp_sentence, 0)

        # as the target is english, the first token fed to the transformer should be the
        # english start token.
        decoder_input = [self.tokenizer_en.vocab_size]
        output = tf.expand_dims(decoder_input, 0)

        for i in range(self.MAX_SEQ_LENGTH):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions, attention_weights = self.model(encoder_input,
                                                        output,
                                                        False,
                                                        enc_padding_mask,
                                                        combined_mask,
                                                        dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, self.tokenizer_en.vocab_size + 1):
                return tf.squeeze(output, axis=0), attention_weights

            # concatenate the predicted_id to the output, which is fed to the decoder
            # as its input.
            output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0), attention_weights

    def plot_attention_weights(self, attention, sentence, result, layer):
        fig = plt.figure(figsize=(16, 8))

        sentence_ids = self.encode_zh(sentence)

        attention = tf.squeeze(attention[layer], axis=0)

        for head in range(attention.shape[0]):
            ax = fig.add_subplot(2, 4, head + 1)

            # plot the attention weights
            ax.matshow(attention[head][:-1, :], cmap='viridis')

            fontdict = {'fontsize': 10, 'family': 'DFKai-SB'}

            ax.set_xticks(range(len(sentence_ids)))
            ax.set_yticks(range(len(result)))
            ax.set_ylim(len(result) - 1.5, -0.5)

            ax.set_xticklabels(self.tokenizer_zh.convert_ids_to_tokens(sentence_ids),
                               fontdict=fontdict, rotation=90)
            ax.set_yticklabels([self.tokenizer_en.decode([i]) for i in result
                                if i < self.tokenizer_en.vocab_size],
                               fontdict=fontdict)

            ax.set_xlabel('Head {}'.format(head + 1))

        plt.tight_layout()
        plt.show()

    def do(self, sentence, plot=''):
        result, attention_weights = self.evaluate(sentence)

        predicted_sentence = self.tokenizer_en.decode([i for i in result
                                                       if i < self.tokenizer_en.vocab_size])

        print('Chinese src: {}'.format(sentence))
        print('Translated : {}'.format(predicted_sentence))

        if plot:
            self.plot_attention_weights(attention_weights, sentence, result, plot)


def main():
    # NOTE: leftover notebook cells; they assume tokenizer_zh, tokenizer_en, transformer,
    # config, target_vocab_size, bert_config_file, inp, tar_inp and a translate() helper
    # from the training script.
    sentence_ids = encode_zh("我爱你啊")
    print(tokenizer_zh.convert_ids_to_tokens(sentence_ids))

    translate(transformer, '虽然继承了祖荫,但朴槿惠已经证明了自己是个机敏而老练的政治家——她历经20年才爬上韩国大国家党最高领导层并成为全国知名人物。')
    print('Real translation: While Park derives some of her power from her family pedigree, she has proven to be an astute and seasoned politician – one who climbed the Grand National Party’s leadership ladder over the last two decades to emerge as a national figure.')

    translate(transformer, "我爱你是一件幸福的事情。")

    # ## Save weights
    transformer.save_weights('bert_nmt_ckpt')

    new_transformer = Transformer(config=config,
                                  target_vocab_size=target_vocab_size,
                                  bert_config_file=bert_config_file)

    fn_out, _ = new_transformer(inp, tar_inp,
                                True,
                                enc_padding_mask=None,
                                look_ahead_mask=None,
                                dec_padding_mask=None)

    new_transformer.load_weights('bert_nmt_ckpt')

    translate(new_transformer, '我爱你')


if __name__ == '__main__':
    main()
tokenizer.py
import tensorflow as tf
import tensorflow_datasets as tfds

import collections
import unicodedata
import os
import sys
import numpy as np


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE):
    # ## Setup input pipeline
    # Use TFDS to load the WMT 2019 zh-en translation dataset.
    if not os.path.exists('chinese_L-12_H-768_A-12'):
        # wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
        # unzip chinese_L-12_H-768_A-12.zip
        print('download pretrained first!')
        sys.exit()

    config = tfds.translate.wmt.WmtConfig(
        description="WMT 2019 translation task dataset.",
        version="0.0.3",
        language_pair=("zh", "en"),
        subsets={
            tfds.Split.TRAIN: ["newscommentary_v13"],
            tfds.Split.VALIDATION: ["newsdev2017"],
        })

    builder = tfds.builder("wmt_translate", config=config)
    print(builder.info.splits)
    builder.download_and_prepare()
    datasets = builder.as_dataset(as_supervised=True)
    print('datasets is {}'.format(datasets))

    train_examples = datasets['train']
    val_examples = datasets['validation']

    for zh, en in train_examples.take(1):
        print(tf.compat.as_text(zh.numpy()))
        print(tf.compat.as_text(en.numpy()))

    # Create a custom subwords tokenizer from the training dataset for the decoder.
    vocab_file = 'vocab_en'
    if os.path.isfile(vocab_file + '.subwords'):
        tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file(vocab_file)
    else:
        tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (en.numpy() for zh, en in train_examples), target_vocab_size=2 ** 13)
        tokenizer_en.save_to_file('vocab_en')

    sample_string = 'Transformer is awesome.'
    tokenized_string = tokenizer_en.encode(sample_string)
    for ts in tokenized_string:
        print('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

    # The encoder uses the BERT tokenizer.
    tokenizer_zh = FullTokenizer(vocab_file='chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True)

    test_tokens = tokenizer_zh.tokenize('今天天气不错额。')
    test_ids = tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + test_tokens + ['[SEP]'])
    print('tokens:', test_tokens)
    print('ids:', test_ids)
    print('convert_ids_to_tokens:', tokenizer_zh.convert_ids_to_tokens(test_ids))

    def encode(zh, en, seq_length=MAX_SEQ_LENGTH):
        tokens_zh = tokenizer_zh.tokenize(tf.compat.as_text(zh.numpy()))
        lang1 = tokenizer_zh.convert_tokens_to_ids(['[CLS]'] + tokens_zh + ['[SEP]'])
        if len(lang1) < seq_length:
            lang1 = lang1 + list(np.zeros(seq_length - len(lang1), 'int32'))

        # insert SOS and EOS
        lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
            tf.compat.as_text(en.numpy())) + [tokenizer_en.vocab_size + 1]
        if len(lang2) < seq_length:
            lang2 = lang2 + list(np.zeros(seq_length - len(lang2), 'int32'))

        return lang1, lang2

    def filter_max_length(x, y, max_length=MAX_SEQ_LENGTH):
        return tf.logical_and(tf.size(x) <= max_length,
                              tf.size(y) <= max_length)

    train_dataset = train_examples.map(
        lambda zh, en: tf.py_function(encode, [zh, en], [tf.int32, tf.int32]))
    train_dataset = train_dataset.filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(20000).padded_batch(
        BATCH_SIZE, padded_shapes=([-1], [-1]), drop_remainder=True)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    val_dataset = val_examples.map(
        lambda zh, en: tf.py_function(encode, [zh, en], [tf.int32, tf.int32]))
    val_dataset = val_dataset.filter(filter_max_length)
    val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

    return train_dataset, val_dataset, tokenizer_en, tokenizer_zh


if __name__ == '__main__':
    get_tokenizer(100, 64)
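The greedy longest-match-first behaviour described in WordpieceTokenizer.tokenize is easy to check with a toy vocabulary. The vocabulary below is an assumption for illustration only, not the BERT vocab.txt:

from tokenizer import WordpieceTokenizer

toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
wp = WordpieceTokenizer(vocab=toy_vocab)
print(wp.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wp.tokenize("xyz"))        # ['[UNK]'] -- no matching pieces, falls back to unk_token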
transformer.py
import tensorflow as tf
import numpy as np

from utils import positional_encoding
from attlayer import EncoderLayer, DecoderLayer


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


"""### Decoder

The `Decoder` consists of:
1. Output Embedding
2. Positional Encoding
3. N decoder layers

The target is put through an embedding which is summed with the positional encoding.
The output of this summation is the input to the decoder layers. The output of the
decoder is the input to the final linear layer.
"""


class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


"""## Create the Transformer

The Transformer consists of the encoder, the decoder and a final linear layer. The output
of the decoder is the input to the linear layer and its output is returned.
"""


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights


if __name__ == '__main__':
    sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
                             dff=2048, input_vocab_size=8500)
    sample_encoder_output = sample_encoder(tf.random.uniform((64, 62)),
                                           training=False, mask=None)
    print(sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8,
                             dff=2048, target_vocab_size=8000)
    output, attn = sample_decoder(tf.random.uniform((64, 26)),
                                  enc_output=sample_encoder_output,
                                  training=False, look_ahead_mask=None,
                                  padding_mask=None)
    print(output.shape, attn['decoder_layer2_block2'].shape)

    sample_transformer = Transformer(num_layers=2, d_model=512, num_heads=8, dff=2048,
                                     input_vocab_size=8500, target_vocab_size=8000)
    temp_input = tf.random.uniform((64, 62))
    temp_target = tf.random.uniform((64, 26))

    fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                                   enc_padding_mask=None,
                                   look_ahead_mask=None,
                                   dec_padding_mask=None)
    print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)
transformer_train.py
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from tokenizer import get_tokenizer
from transformer import Transformer
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 20000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

# Chinese -> English translation
input_vocab_size = 21128
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

num_layers = 4
d_model = 512
dff = 2048
num_heads = 8

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))

fn_out, _ = transformer(inp, tar_inp,
                        True,
                        enc_padding_mask=None,
                        look_ahead_mask=None,
                        dec_padding_mask=None)

print(tar_inp.shape)  # (batch_size, tar_seq_len)
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)

transformer.summary()

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

checkpoint_path = "./zh-en/transformer"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')


@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp,
                                     True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


# Chinese is used as the input language and English is the target language.
translator = Translator(tokenizer_zh, tokenizer_en, transformer, MAX_SEQ_LENGTH)

for epoch in range(20):
    (cn_code, en_code) = next(iter(val_dataset))
    cn_code, en_code = cn_code[epoch].numpy(), en_code[epoch].numpy()

    en = tokenizer_en.decode([i for i in en_code if i < tokenizer_en.vocab_size])
    cn_code = [int(i) for i in cn_code if (i != 101 and i != 102 and i != 1 and i != 0)]
    cn = tokenizer_zh.convert_ids_to_tokens(cn_code)
    cn = "".join(cn)

    translator.do(cn)
    print('Real:', en)
    print('\n')

    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 50 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 3 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
utils.py
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


# ## Masking
# Mask all the pad tokens in the batch of sequences. This ensures that the model does not
# treat padding as input. The mask indicates where the pad value 0 is present: it outputs
# a 1 at those locations, and a 0 otherwise.

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions so that we can add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


# The look-ahead mask is used to mask the future tokens in a sequence. In other words,
# the mask indicates which entries should not be used.
#
# This means that to predict the third word, only the first and second word will be used.
# Similarly, to predict the fourth word, only the first, second and third word will be
# used, and so on.

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask


# def create_masks2(inp, tar):
#     # Used in the 2nd attention block in the decoder.
#     # This padding mask is used to mask the encoder outputs.
#     dec_padding_mask = create_padding_mask(inp)
#
#     # Used in the 1st attention block in the decoder.
#     # It is used to pad and mask future tokens in the input received by
#     # the decoder.
#     look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
#     dec_target_padding_mask = create_padding_mask(tar)
#     combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
#
#     return combined_mask, dec_padding_mask


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


def main():
    temp_learning_rate_schedule = CustomSchedule(d_model=512)
    plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()


if __name__ == '__main__':
    main()
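To sanity-check the two mask helpers: create_padding_mask marks pad ids (0) with 1.0 and broadcasts over heads and query positions, while create_look_ahead_mask is strictly upper-triangular, so position i can only attend to positions up to i. A small standalone example; the toy inputs are assumptions for illustration:

import tensorflow as tf
from utils import create_padding_mask, create_look_ahead_mask

print(create_padding_mask(tf.constant([[7, 6, 0, 0]])).numpy())
# [[[[0. 0. 1. 1.]]]]  -- shape (batch_size, 1, 1, seq_len)

print(create_look_ahead_mask(3).numpy())
# [[0. 1. 1.]
#  [0. 0. 1.]
#  [0. 0. 0.]]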

