# K-fold Validation PorkCNN

author: davidycliao(David Yen-Chieh Liao) 
email: davidycliao@gmail.com 
date: 9-July-2021 

-------------------------

### Stage 1: Libaries & Dependencies

In [1]:
# built-in library
import math
import re
import collections
import zipfile
import random
from itertools import chain

# ML & Deep Learning/ NLP toolkit
import pandas as pd
import numpy as np
import jieba
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

-------------------------

### Stage 2: Data Preprocessing (Training Data: Introduction of Bills and Legislation from 6th Session to 7th Session, 2004-2012)

#### (1) Read file 

In [2]:
# read file
df = pd.read_csv('data/Pork Bill - 2021-05-20.csv',encoding='utf-8')

# combine abstract of bill and title 
df['text'] = df['Title'] + df['Content'].fillna(df['Title'])

# drop conten without having any characters
# view na's row: df[df['text'].isnull()==True]
data = df[['text', 'pork_bill']].dropna(subset=['text'])

In [3]:
print(" Pork Legislation", data['pork_bill'].value_counts()[1],'\n', 
 "None-Pork Legislation", data['pork_bill'].value_counts()[0])

 Pork Legislation 2510 
 None-Pork Legislation 4733


#### (2) Tokenization

In [4]:
import collections
import numpy as np
import jieba
from itertools import chain


def jieba_cut(filename):
 """
 cut Chinese and remove stop words
 Reference: https://www.cnblogs.com/Luv-GEM/p/10836454.html
 Stopwords: https://www.kaggle.com/rikdifos/english-and-chinese-stopwords?select=cn_stopwords.txt
 """
 stop_list = [i.strip() for i in open('cn_stopwords.txt','r',encoding='utf-8')] 
 news_cut = []
 news_list = []
 for line in filename: 
 if line:
 news_cut = list(jieba.cut(''.join(line),cut_all=False,HMM=True)) 
 news_list.append([word.strip() for word in news_cut if word not in stop_list and len(word.strip())>0]) 
 news_list = list(chain.from_iterable(news_list)) 
 return news_list

def clearPucts(context):
 """
 remove punctuation
 ref: https://chenyuzuoo.github.io/posts/28001/
 """
 context = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", context)
 context = re.sub("[【】╮╯▽╰╭★→「」]+","", context)
 context = re.sub("!,❤。~《》:()【】「」?”“;:、","",context)
 context = re.sub("\s","",context)
 return context

def seg_char(sent):
 """
 cut Chinese and remove stop words
 ref: https://blog.csdn.net/renyuanfang/article/details/86487367
 """
 # split
 pattern_char_1 = re.compile(r'([\W])')
 parts = pattern_char_1.split(sent)
 parts = [p for p in parts if len(p.strip())>0]
 # cut sentence
 pattern = re.compile(r'([\u4e00-\u9fa5])')
 chars = pattern.split(sent)
 chars = [w for w in chars if len(w.strip())>0]
 chars = ' '.join(chars)
 return chars

In [5]:
data_clean = [seg_char(text) for text in [clearPucts(text) for text in data.text]]

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
 data_clean, target_vocab_size=2**18)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

#### (3) Padding

In [6]:
MAX_LEN = max([len(sentence) for sentence in data_clean])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
 value=0,
 padding="post",
 maxlen=MAX_LEN)

data_labels = data.pork_bill.values
#print('Maximun length:{} \nInput:{}'.format(MAX_LEN, data_inputs.shape[0]))


#### (4) Spliting Training / Testing Set

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
train_data, test_data, train_targets, test_targets = train_test_split(
 data_inputs, data_labels, test_size=0.33, random_state=42)

print("Shape of X Train:", train_data.shape, '\n'
 "Shape of X Test :", test_data.shape,'\n'
 "Shape of Y Trian:", train_targets.shape , '\n'
 "Shape of Y Test :", test_targets.shape )

Shape of X Train: (4852, 785) 
Shape of X Test : (2391, 785) 
Shape of Y Trian: (4852,) 
Shape of Y Test : (2391,)


-------------------------


### Stage 3: Model and Building

#### (1) Using the Subclassing API to Build Dynamic Model

In [8]:
class DCNN(tf.keras.Model):
 def __init__(self,
 vocab_size,
 emb_dim=128,
 nb_filters=100,
 # units: Positive integer, dimensionality of the output space.
 FFN_units=512,
 nb_classes=2,
 dropout_rate=0.1,
 training=False,
 name="PorkCNN"):
 super(DCNN, self).__init__(name=name)
 self.embedding = layers.Embedding(vocab_size, emb_dim)
 self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, strides = 1, padding="valid", activation="relu")
 self.bigram2 = layers.Conv1D(filters=nb_filters, kernel_size=2, strides = 2, padding="valid", activation="relu")
 self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3,strides = 1, padding="valid",activation="relu")
 self.trigram2 = layers.Conv1D(filters=nb_filters,kernel_size=3,strides = 2, padding="valid",activation="relu")
 self.fourgram = layers.Conv1D(filters=nb_filters,kernel_size=4,strides = 2, padding="valid",activation="relu")
 self.fivegram = layers.Conv1D(filters=nb_filters,kernel_size=5,strides = 2, padding="valid",activation="relu")
 self.pool = layers.GlobalMaxPool1D()
 self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
 self.dropout = layers.Dropout(rate=dropout_rate)
 self.last_dense = layers.Dense(units=1, activation="sigmoid") 
 def call(self, inputs, training):
 x = self.embedding(inputs)
 x_1 = self.bigram(x)
 x_1 = self.pool(x_1)
 x_1_1 = self.bigram2(x)
 x_1_1 = self.pool(x_1_1) 
 x_2 = self.trigram(x)
 x_2 = self.pool(x_2)
 x_2_1 = self.trigram2(x)
 x_2_1 = self.pool(x_2_1) 
 x_3 = self.fourgram(x)
 x_3 = self.pool(x_3)
 x_4 = self.fourgram(x)
 x_4 = self.pool(x_4) 
 x_5 = self.fivegram(x)
 x_5 = self.pool(x_5) 
 merged = tf.concat([x_1,x_1_1, x_2,x_2_1, x_3, x_4, x_5], axis=-1) 
 merged = self.dense_1(merged)
 merged = self.dropout(merged, training)
 output = self.last_dense(merged) 
 return output

In [9]:
VOCAB_SIZE = tokenizer.vocab_size #tokenizer.vocab_size # 5000 tokenizer.vocab_size
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2 #len(set(train_labels))
DROPOUT_RATE = 0.25
BATCH_SIZE = 230
NB_EPOCHS = 80

Dcnn = DCNN(vocab_size=VOCAB_SIZE,
 emb_dim=EMB_DIM,
 nb_filters=NB_FILTERS,
 FFN_units=FFN_UNITS,
 nb_classes=NB_CLASSES,
 dropout_rate=DROPOUT_RATE)

#### (2) Compile and Summary of the Model

In [10]:
Dcnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
Dcnn.build(input_shape = (train_data.shape[1], EMB_DIM)) # (train_inputs.shape[1] , EMB_DIM) (785 , EMB_DIM)
Dcnn.summary()

Model: "PorkCNN"
_________________________________________________________________
Layer (type) Output Shape Param # 
embedding (Embedding) multiple 586600 
_________________________________________________________________
conv1d (Conv1D) multiple 40100 
_________________________________________________________________
conv1d_1 (Conv1D) multiple 40100 
_________________________________________________________________
conv1d_2 (Conv1D) multiple 60100 
_________________________________________________________________
conv1d_3 (Conv1D) multiple 60100 
_________________________________________________________________
conv1d_4 (Conv1D) multiple 80100 
_________________________________________________________________
conv1d_5 (Conv1D) multiple 100100 
_________________________________________________________________
global_max_pooling1d (Global multiple 0 
_________________________________________________________________
dense (Dense) multiple 179456 
________________________________________

-------------------------

### Stage 4: K-fold Validation 

#### (1) Loss & Accuracy

Code Reference: François Chollet, Deep Learning with Python, 4.3.4, 2020

In [None]:
k=5
num_val_samples = len(train_data) // k 
num_epochs = 8
batch_size = 230
all_loss = []
all_accuracy = []
for i in range(k):
 print('processing fold #%d' % i)
 val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
 val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
 partial_train_data = np.concatenate(
 [train_data[:i * num_val_samples],
 train_data[(i + 1) * num_val_samples:]],
 axis=0)
 partial_train_targets = np.concatenate(
 [train_targets[:i * num_val_samples],
 train_targets[(i + 1) * num_val_samples:]],
 axis=0)
 Dcnn.fit(partial_train_data, partial_train_targets,
 epochs=num_epochs,
 validation_data=(val_data, val_targets),
 batch_size=batch_size,
 callbacks=[early_stop], 
 verbose=1)
 loss, accuracy = Dcnn.evaluate(val_data, val_targets, verbose=0, batch_size=batch_size)
 all_accuracy.append(accuracy)
 all_loss.append(loss)

In [None]:
np.mean(all_accuracy)

### Stage 5: Storing the Validation Logs 

In [None]:
k=5
num_val_samples = len(train_data) // k 
num_epochs = 10
batch_size = 230
early_stop = EarlyStopping(monitor='val_loss',patience=1, verbose=1)
loss = [] 
accuracy = []
val_loss = []
val_accuracy = []
df= pd.DataFrame()
for i in range(k):
 print('processing fold #%d' % i)
 val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
 val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
 partial_train_data = np.concatenate(
 [train_data[:i * num_val_samples],
 train_data[(i + 1) * num_val_samples:]],
 axis=0)
 partial_train_targets = np.concatenate(
 [train_targets[:i * num_val_samples],
 train_targets[(i + 1) * num_val_samples:]],
 axis=0)
 history = Dcnn.fit(partial_train_data, 
 partial_train_targets,
 epochs=num_epochs,
 validation_data=(val_data, val_targets),
 batch_size=batch_size,
 #callbacks=[early_stop], 
 verbose=0)
 loss = history.history['loss']
 accuracy = history.history['accuracy']
 val_loss = history.history['val_loss']
 val_accuracy = history.history['val_accuracy']

In [None]:
#Building the history of successive mean K-fold validation scores
average_mae_history = [np.mean([x[i] for x in accuracy]) for i in range(num_epochs)] 
 

### Stage 6: Training the Final Model

In [None]:
Dcnn.fit(train_data, train_targets,
 batch_size=BATCH_SIZE,
 epochs=7)

In [None]:
evaluation_model = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(evaluation_model)

In [None]:
train_data, test_data, train_targets, test_targets

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# pd.DataFrame(confusion_matrix(test_labels,predictions))
predictions = Dcnn.predict(test_data)
predictions = np.where(predictions >0.8 , 1, 0)
print(classification_report(test_targets, predictions))

In [None]:
t = pd.DataFrame(confusion_matrix(test_targets,predictions), 
 columns=['Predictions: Not Pork(0)','Predictions:Pork(1)'])
t.index = ['Acutal: Not Pork(0)', 'Acutal: Pork (1)']
t