{ "cells": [ { "cell_type": "markdown", "id": "b65dfbdc-bb80-4bd4-8248-dd06483222cd", "metadata": {}, "source": [ "# K-fold Validation PorkCNN\n", "\n", "author: davidycliao(David Yen-Chieh Liao) \n", "email: davidycliao@gmail.com \n", "date: 9-July-2021 " ] }, { "cell_type": "markdown", "id": "e8005d5a-f938-4c26-86d5-592408152d08", "metadata": {}, "source": [ "-------------------------\n", "\n", "### Stage 1: Libaries & Dependencies" ] }, { "cell_type": "code", "execution_count": 1, "id": "6ca92ba3-7480-418f-91e0-d2beea2d10af", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T22:59:53.467113Z", "iopub.status.busy": "2021-07-08T22:59:53.466883Z", "iopub.status.idle": "2021-07-08T22:59:59.014730Z", "shell.execute_reply": "2021-07-08T22:59:59.014249Z", "shell.execute_reply.started": "2021-07-08T22:59:53.467088Z" }, "tags": [] }, "outputs": [], "source": [ "# built-in library\n", "import math\n", "import re\n", "import collections\n", "import zipfile\n", "import random\n", "from itertools import chain\n", "\n", "# ML & Deep Learning/ NLP toolkit\n", "import pandas as pd\n", "import numpy as np\n", "import jieba\n", "from sklearn.model_selection import train_test_split\n", "import tensorflow as tf\n", "from tensorflow.keras import layers\n", "import tensorflow_datasets as tfds\n", "from tensorflow.keras.callbacks import EarlyStopping,TensorBoard\n", "\n", "# Visualization\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "id": "8bd77e05-355a-422f-87db-2cf77cba3bdd", "metadata": {}, "source": [ "-------------------------\n", "\n", "### Stage 2: Data Preprocessing (Training Data: Introduction of Bills and Legislation from 6th Session to 7th Session, 2004-2012)" ] }, { "cell_type": "markdown", "id": "3b57b9c9-443e-4e26-bda2-15f6877845d5", "metadata": {}, "source": [ "#### (1) Read file " ] }, { "cell_type": "code", "execution_count": 2, "id": "e1973cfb-73d4-4717-a67a-a4e8a0c9edb0", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:21.285359Z", "iopub.status.busy": "2021-07-08T23:00:21.285192Z", "iopub.status.idle": "2021-07-08T23:00:21.327935Z", "shell.execute_reply": "2021-07-08T23:00:21.327224Z", "shell.execute_reply.started": "2021-07-08T23:00:21.285341Z" }, "tags": [] }, "outputs": [], "source": [ "# read file\n", "df = pd.read_csv('data/Pork Bill - 2021-05-20.csv',encoding='utf-8')\n", "\n", "# combine abstract of bill and title \n", "df['text'] = df['Title'] + df['Content'].fillna(df['Title'])\n", "\n", "# drop conten without having any characters\n", "# view na's row: df[df['text'].isnull()==True]\n", "data = df[['text', 'pork_bill']].dropna(subset=['text'])" ] }, { "cell_type": "code", "execution_count": 3, "id": "f6160785-9522-4404-8f77-6df941869cde", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:21.620052Z", "iopub.status.busy": "2021-07-08T23:00:21.619859Z", "iopub.status.idle": "2021-07-08T23:00:21.624658Z", "shell.execute_reply": "2021-07-08T23:00:21.624057Z", "shell.execute_reply.started": "2021-07-08T23:00:21.620031Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Pork Legislation 2510 \n", " None-Pork Legislation 4733\n" ] } ], "source": [ "print(\" Pork Legislation\", data['pork_bill'].value_counts()[1],'\\n', \n", " \"None-Pork Legislation\", data['pork_bill'].value_counts()[0])" ] }, { "cell_type": "markdown", "id": "5e6d677d-e200-40e6-9252-130eb02fc682", "metadata": { "execution": { "iopub.execute_input": 
"2021-05-16T17:01:03.001989Z", "iopub.status.busy": "2021-05-16T17:01:03.001801Z", "iopub.status.idle": "2021-05-16T17:01:03.004600Z", "shell.execute_reply": "2021-05-16T17:01:03.003789Z", "shell.execute_reply.started": "2021-05-16T17:01:03.001969Z" } }, "source": [ "#### (2) Tokenization" ] }, { "cell_type": "code", "execution_count": 4, "id": "120e8b54-7f75-4ca6-a0d0-9872b1522e8e", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:21.966144Z", "iopub.status.busy": "2021-07-08T23:00:21.965937Z", "iopub.status.idle": "2021-07-08T23:00:21.972829Z", "shell.execute_reply": "2021-07-08T23:00:21.972242Z", "shell.execute_reply.started": "2021-07-08T23:00:21.966122Z" }, "tags": [] }, "outputs": [], "source": [ "import collections\n", "import numpy as np\n", "import jieba\n", "from itertools import chain\n", "\n", "\n", "def jieba_cut(filename):\n", " \"\"\"\n", " cut Chinese and remove stop words\n", " Reference: https://www.cnblogs.com/Luv-GEM/p/10836454.html\n", " Stopwords: https://www.kaggle.com/rikdifos/english-and-chinese-stopwords?select=cn_stopwords.txt\n", " \"\"\"\n", " stop_list = [i.strip() for i in open('cn_stopwords.txt','r',encoding='utf-8')] \n", " news_cut = []\n", " news_list = []\n", " for line in filename: \n", " if line:\n", " news_cut = list(jieba.cut(''.join(line),cut_all=False,HMM=True)) \n", " news_list.append([word.strip() for word in news_cut if word not in stop_list and len(word.strip())>0]) \n", " news_list = list(chain.from_iterable(news_list)) \n", " return news_list\n", "\n", "def clearPucts(context):\n", " \"\"\"\n", " remove punctuation\n", " ref: https://chenyuzuoo.github.io/posts/28001/\n", " \"\"\"\n", " context = re.sub(\"[\\s+\\.\\!\\/_,$%^*(+\\\"\\']+|[+——!,。?、~@#¥%……&*()]+\", \"\", context)\n", " context = re.sub(\"[【】╮╯▽╰╭★→「」]+\",\"\", context)\n", " context = re.sub(\"!,❤。~《》:()【】「」?”“;:、\",\"\",context)\n", " context = re.sub(\"\\s\",\"\",context)\n", " return context\n", "\n", "def seg_char(sent):\n", " \"\"\"\n", " cut Chinese and remove stop words\n", " ref: https://blog.csdn.net/renyuanfang/article/details/86487367\n", " \"\"\"\n", " # split\n", " pattern_char_1 = re.compile(r'([\\W])')\n", " parts = pattern_char_1.split(sent)\n", " parts = [p for p in parts if len(p.strip())>0]\n", " # cut sentence\n", " pattern = re.compile(r'([\\u4e00-\\u9fa5])')\n", " chars = pattern.split(sent)\n", " chars = [w for w in chars if len(w.strip())>0]\n", " chars = ' '.join(chars)\n", " return chars" ] }, { "cell_type": "code", "execution_count": 5, "id": "25f4277c-001d-47a4-81d0-f5ac5ec5e5d3", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:22.140770Z", "iopub.status.busy": "2021-07-08T23:00:22.140512Z", "iopub.status.idle": "2021-07-08T23:00:24.043746Z", "shell.execute_reply": "2021-07-08T23:00:24.043162Z", "shell.execute_reply.started": "2021-07-08T23:00:22.140740Z" }, "tags": [] }, "outputs": [], "source": [ "data_clean = [seg_char(text) for text in [clearPucts(text) for text in data.text]]\n", "\n", "tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(\n", " data_clean, target_vocab_size=2**18)\n", "\n", "data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]" ] }, { "cell_type": "markdown", "id": "089e65af-e49c-4d60-962e-a0a826809aa5", "metadata": { "execution": { "iopub.execute_input": "2021-05-16T17:01:47.611294Z", "iopub.status.busy": "2021-05-16T17:01:47.611020Z", "iopub.status.idle": "2021-05-16T17:01:47.614208Z", "shell.execute_reply": "2021-05-16T17:01:47.613487Z", 
"shell.execute_reply.started": "2021-05-16T17:01:47.611276Z" } }, "source": [ "#### (3) Padding" ] }, { "cell_type": "code", "execution_count": 6, "id": "b801d231-972c-47e3-83c4-7375c2e11a18", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:24.075840Z", "iopub.status.busy": "2021-07-08T23:00:24.075642Z", "iopub.status.idle": "2021-07-08T23:00:24.141494Z", "shell.execute_reply": "2021-07-08T23:00:24.140925Z", "shell.execute_reply.started": "2021-07-08T23:00:24.075816Z" }, "tags": [] }, "outputs": [], "source": [ "MAX_LEN = max([len(sentence) for sentence in data_clean])\n", "data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,\n", " value=0,\n", " padding=\"post\",\n", " maxlen=MAX_LEN)\n", "\n", "data_labels = data.pork_bill.values\n", "#print('Maximun length:{} \\nInput:{}'.format(MAX_LEN, data_inputs.shape[0]))\n" ] }, { "cell_type": "markdown", "id": "0346b3fa-dd60-4662-9af2-83e932d09d3c", "metadata": {}, "source": [ "#### (4) Spliting Training / Testing Set" ] }, { "cell_type": "code", "execution_count": 7, "id": "383bf84d-b61d-45e3-89bd-64c908efbf6c", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:24.142886Z", "iopub.status.busy": "2021-07-08T23:00:24.142722Z", "iopub.status.idle": "2021-07-08T23:00:24.156188Z", "shell.execute_reply": "2021-07-08T23:00:24.155525Z", "shell.execute_reply.started": "2021-07-08T23:00:24.142865Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of X Train: (4852, 785) \n", "Shape of X Test : (2391, 785) \n", "Shape of Y Trian: (4852,) \n", "Shape of Y Test : (2391,)\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "train_data, test_data, train_targets, test_targets = train_test_split(\n", " data_inputs, data_labels, test_size=0.33, random_state=42)\n", "\n", "print(\"Shape of X Train:\", train_data.shape, '\\n'\n", " \"Shape of X Test :\", test_data.shape,'\\n'\n", " \"Shape of Y Trian:\", train_targets.shape , '\\n'\n", " \"Shape of Y Test :\", test_targets.shape )" ] }, { "cell_type": "markdown", "id": "725f7feb-b93b-4d0b-bcbf-562240a79f8b", "metadata": { "execution": { "iopub.execute_input": "2021-05-16T17:04:13.375644Z", "iopub.status.busy": "2021-05-16T17:04:13.375440Z", "iopub.status.idle": "2021-05-16T17:04:13.378111Z", "shell.execute_reply": "2021-05-16T17:04:13.377464Z", "shell.execute_reply.started": "2021-05-16T17:04:13.375623Z" }, "tags": [] }, "source": [ "-------------------------\n", "\n", "\n", "### Stage 3: Model and Building" ] }, { "cell_type": "markdown", "id": "4a0e74bf-eaf4-4043-8d23-88291d8ea344", "metadata": { "execution": { "iopub.execute_input": "2021-05-16T17:13:18.807725Z", "iopub.status.busy": "2021-05-16T17:13:18.807498Z", "iopub.status.idle": "2021-05-16T17:13:18.810574Z", "shell.execute_reply": "2021-05-16T17:13:18.809625Z", "shell.execute_reply.started": "2021-05-16T17:13:18.807701Z" } }, "source": [ "#### (1) Using the Subclassing API to Build Dynamic Model" ] }, { "cell_type": "code", "execution_count": 8, "id": "1f18f9b1-5f4d-4d0a-baf4-2eebed7f34f2", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:25.614474Z", "iopub.status.busy": "2021-07-08T23:00:25.614219Z", "iopub.status.idle": "2021-07-08T23:00:25.624817Z", "shell.execute_reply": "2021-07-08T23:00:25.624119Z", "shell.execute_reply.started": "2021-07-08T23:00:25.614446Z" }, "tags": [] }, "outputs": [], "source": [ "class DCNN(tf.keras.Model):\n", " def __init__(self,\n", " 
vocab_size,\n", " emb_dim=128,\n", " nb_filters=100,\n", " # units: Positive integer, dimensionality of the output space.\n", " FFN_units=512,\n", " nb_classes=2,\n", " dropout_rate=0.1,\n", " training=False,\n", " name=\"PorkCNN\"):\n", " super(DCNN, self).__init__(name=name)\n", " self.embedding = layers.Embedding(vocab_size, emb_dim)\n", " self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, strides = 1, padding=\"valid\", activation=\"relu\")\n", " self.bigram2 = layers.Conv1D(filters=nb_filters, kernel_size=2, strides = 2, padding=\"valid\", activation=\"relu\")\n", " self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3,strides = 1, padding=\"valid\",activation=\"relu\")\n", " self.trigram2 = layers.Conv1D(filters=nb_filters,kernel_size=3,strides = 2, padding=\"valid\",activation=\"relu\")\n", " self.fourgram = layers.Conv1D(filters=nb_filters,kernel_size=4,strides = 2, padding=\"valid\",activation=\"relu\")\n", " self.fivegram = layers.Conv1D(filters=nb_filters,kernel_size=5,strides = 2, padding=\"valid\",activation=\"relu\")\n", " self.pool = layers.GlobalMaxPool1D()\n", " self.dense_1 = layers.Dense(units=FFN_units, activation=\"relu\")\n", " self.dropout = layers.Dropout(rate=dropout_rate)\n", " self.last_dense = layers.Dense(units=1, activation=\"sigmoid\") \n", " def call(self, inputs, training):\n", " x = self.embedding(inputs)\n", " x_1 = self.bigram(x)\n", " x_1 = self.pool(x_1)\n", " x_1_1 = self.bigram2(x)\n", " x_1_1 = self.pool(x_1_1) \n", " x_2 = self.trigram(x)\n", " x_2 = self.pool(x_2)\n", " x_2_1 = self.trigram2(x)\n", " x_2_1 = self.pool(x_2_1) \n", " x_3 = self.fourgram(x)\n", " x_3 = self.pool(x_3)\n", " x_4 = self.fourgram(x)\n", " x_4 = self.pool(x_4) \n", " x_5 = self.fivegram(x)\n", " x_5 = self.pool(x_5) \n", " merged = tf.concat([x_1,x_1_1, x_2,x_2_1, x_3, x_4, x_5], axis=-1) \n", " merged = self.dense_1(merged)\n", " merged = self.dropout(merged, training)\n", " output = self.last_dense(merged) \n", " return output" ] }, { "cell_type": "code", "execution_count": 9, "id": "bfc02e93-3a0e-4c1c-b488-b2db98b53824", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:26.106352Z", "iopub.status.busy": "2021-07-08T23:00:26.106099Z", "iopub.status.idle": "2021-07-08T23:00:26.128713Z", "shell.execute_reply": "2021-07-08T23:00:26.128071Z", "shell.execute_reply.started": "2021-07-08T23:00:26.106324Z" }, "tags": [] }, "outputs": [], "source": [ "VOCAB_SIZE = tokenizer.vocab_size #tokenizer.vocab_size # 5000 tokenizer.vocab_size\n", "EMB_DIM = 200\n", "NB_FILTERS = 100\n", "FFN_UNITS = 256\n", "NB_CLASSES = 2 #len(set(train_labels))\n", "DROPOUT_RATE = 0.25\n", "BATCH_SIZE = 230\n", "NB_EPOCHS = 80\n", "\n", "Dcnn = DCNN(vocab_size=VOCAB_SIZE,\n", " emb_dim=EMB_DIM,\n", " nb_filters=NB_FILTERS,\n", " FFN_units=FFN_UNITS,\n", " nb_classes=NB_CLASSES,\n", " dropout_rate=DROPOUT_RATE)" ] }, { "cell_type": "markdown", "id": "b0ec8f51-6ed8-49fa-9d5e-c6929184bdf1", "metadata": { "execution": { "iopub.execute_input": "2021-05-16T18:43:59.279424Z", "iopub.status.busy": "2021-05-16T18:43:59.279202Z", "iopub.status.idle": "2021-05-16T18:43:59.282718Z", "shell.execute_reply": "2021-05-16T18:43:59.281823Z", "shell.execute_reply.started": "2021-05-16T18:43:59.279399Z" } }, "source": [ "#### (2) Compile and Summary of the Model" ] }, { "cell_type": "code", "execution_count": 10, "id": "2cce0414-05eb-4f79-9c27-bca9c91a2903", "metadata": { "execution": { "iopub.execute_input": "2021-07-08T23:00:28.471734Z", "iopub.status.busy": 
"2021-07-08T23:00:28.471481Z", "iopub.status.idle": "2021-07-08T23:00:28.552868Z", "shell.execute_reply": "2021-07-08T23:00:28.552157Z", "shell.execute_reply.started": "2021-07-08T23:00:28.471706Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"PorkCNN\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding (Embedding) multiple 586600 \n", "_________________________________________________________________\n", "conv1d (Conv1D) multiple 40100 \n", "_________________________________________________________________\n", "conv1d_1 (Conv1D) multiple 40100 \n", "_________________________________________________________________\n", "conv1d_2 (Conv1D) multiple 60100 \n", "_________________________________________________________________\n", "conv1d_3 (Conv1D) multiple 60100 \n", "_________________________________________________________________\n", "conv1d_4 (Conv1D) multiple 80100 \n", "_________________________________________________________________\n", "conv1d_5 (Conv1D) multiple 100100 \n", "_________________________________________________________________\n", "global_max_pooling1d (Global multiple 0 \n", "_________________________________________________________________\n", "dense (Dense) multiple 179456 \n", "_________________________________________________________________\n", "dropout (Dropout) multiple 0 \n", "_________________________________________________________________\n", "dense_1 (Dense) multiple 257 \n", "=================================================================\n", "Total params: 1,146,913\n", "Trainable params: 1,146,913\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "Dcnn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n", "Dcnn.build(input_shape = (train_data.shape[1], EMB_DIM)) # (train_inputs.shape[1] , EMB_DIM) (785 , EMB_DIM)\n", "Dcnn.summary()" ] }, { "cell_type": "markdown", "id": "4b08770a-d76c-4573-bd24-9984070cc5ba", "metadata": {}, "source": [ "-------------------------\n", "\n", "### Stage 4: K-fold Validation " ] }, { "cell_type": "markdown", "id": "e4123381-f66a-4170-91c6-c2911c0daec1", "metadata": {}, "source": [ "#### (1) Loss & Accuracy\n", "\n", "Code Reference: François Chollet, Deep Learning with Python, 4.3.4, 2020" ] }, { "cell_type": "code", "execution_count": null, "id": "9f47c90c-8952-4824-adf1-a65d3550c2cd", "metadata": { "tags": [] }, "outputs": [], "source": [ "k=5\n", "num_val_samples = len(train_data) // k \n", "num_epochs = 8\n", "batch_size = 230\n", "all_loss = []\n", "all_accuracy = []\n", "for i in range(k):\n", " print('processing fold #%d' % i)\n", " val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]\n", " val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]\n", " partial_train_data = np.concatenate(\n", " [train_data[:i * num_val_samples],\n", " train_data[(i + 1) * num_val_samples:]],\n", " axis=0)\n", " partial_train_targets = np.concatenate(\n", " [train_targets[:i * num_val_samples],\n", " train_targets[(i + 1) * num_val_samples:]],\n", " axis=0)\n", " Dcnn.fit(partial_train_data, partial_train_targets,\n", " epochs=num_epochs,\n", " validation_data=(val_data, val_targets),\n", " batch_size=batch_size,\n", " callbacks=[early_stop], \n", " verbose=1)\n", " loss, accuracy = 
{ "cell_type": "markdown", "id": "384f9370-c9e5-4d80-8014-66581fafac82", "metadata": {}, "source": [ "### Stage 5: Storing the Validation Logs " ] },
{ "cell_type": "code", "execution_count": null, "id": "b97cefc2-458e-4827-8df7-b49c7a74596f", "metadata": { "tags": [] }, "outputs": [], "source": [ "k = 5\n", "num_val_samples = len(train_data) // k\n", "num_epochs = 10\n", "batch_size = 230\n", "early_stop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)\n", "loss = []\n", "accuracy = []\n", "val_loss = []\n", "val_accuracy = []\n", "for i in range(k):\n", "    print('processing fold #%d' % i)\n", "    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]\n", "    val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]\n", "    partial_train_data = np.concatenate(\n", "        [train_data[:i * num_val_samples],\n", "         train_data[(i + 1) * num_val_samples:]],\n", "        axis=0)\n", "    partial_train_targets = np.concatenate(\n", "        [train_targets[:i * num_val_samples],\n", "         train_targets[(i + 1) * num_val_samples:]],\n", "        axis=0)\n", "    # a fresh model per fold, as in Stage 4\n", "    fold_model = DCNN(vocab_size=VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS,\n", "                      FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE)\n", "    fold_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "    history = fold_model.fit(partial_train_data,\n", "                             partial_train_targets,\n", "                             epochs=num_epochs,\n", "                             validation_data=(val_data, val_targets),\n", "                             batch_size=batch_size,\n", "                             #callbacks=[early_stop],\n", "                             verbose=0)\n", "    # store the per-epoch history of every fold\n", "    loss.append(history.history['loss'])\n", "    accuracy.append(history.history['accuracy'])\n", "    val_loss.append(history.history['val_loss'])\n", "    val_accuracy.append(history.history['val_accuracy'])" ] },
{ "cell_type": "code", "execution_count": null, "id": "09412aca-7cf7-4111-834f-cb41e184eab0", "metadata": { "tags": [] }, "outputs": [], "source": [ "# build the history of successive mean K-fold validation scores\n", "average_val_accuracy_history = [\n", "    np.mean([fold[i] for fold in val_accuracy]) for i in range(num_epochs)]" ] },
{ "cell_type": "markdown", "id": "ef89ca79-2053-490a-9274-077288cad96e", "metadata": {}, "source": [ "### Stage 6: Training the Final Model" ] },
{ "cell_type": "code", "execution_count": null, "id": "03598dbd-0ff3-494b-a6ca-415857c2d735", "metadata": { "tags": [] }, "outputs": [], "source": [ "# train the final model on the full training set\n", "Dcnn.fit(train_data, train_targets,\n", "         batch_size=BATCH_SIZE,\n", "         epochs=7)" ] },
{ "cell_type": "code", "execution_count": null, "id": "23ed354d-956b-49e1-9f3a-71fa6bb59ea5", "metadata": {}, "outputs": [], "source": [ "evaluation_model = Dcnn.evaluate(test_data, test_targets, batch_size=BATCH_SIZE)\n", "print(evaluation_model)" ] },
{ "cell_type": "code", "execution_count": null, "id": "6cfedebb-ab12-486d-a636-6055129fe7d9", "metadata": { "tags": [] }, "outputs": [], "source": [ "# quick look at the split arrays\n", "train_data, test_data, train_targets, test_targets" ] },
{ "cell_type": "code", "execution_count": null, "id": "1110ede8-f9ef-4d92-b446-de6b5a48af81", "metadata": { "tags": [] }, "outputs": [], "source": [ "from sklearn.metrics import classification_report, confusion_matrix\n", "\n", "# binarise the sigmoid outputs with a 0.8 threshold\n", "predictions = Dcnn.predict(test_data)\n", "predictions = np.where(predictions > 0.8, 1, 0)\n", "print(classification_report(test_targets, predictions))" ] },
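{ "cell_type": "markdown", "id": "e9f0a1b2-3c4d-4e5f-8a6b-7c8d9e0f1a2b", "metadata": {}, "source": [ "Optionally, the confusion matrix can also be plotted as a heatmap with seaborn (imported in Stage 1 but otherwise unused). A minimal sketch; the labelled table below shows the same numbers." ] },
{ "cell_type": "code", "execution_count": null, "id": "f0a1b2c3-4d5e-4f6a-9b7c-8d9e0f1a2b3c", "metadata": { "tags": [] }, "outputs": [], "source": [ "# illustrative heatmap of the confusion matrix (same numbers as the table below)\n", "cm = confusion_matrix(test_targets, predictions)\n", "fig, ax = plt.subplots(figsize=(4, 3))\n", "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n", "            xticklabels=['Not Pork (0)', 'Pork (1)'],\n", "            yticklabels=['Not Pork (0)', 'Pork (1)'], ax=ax)\n", "ax.set_xlabel('Predicted')\n", "ax.set_ylabel('Actual')\n", "plt.tight_layout()\n", "plt.show()" ] },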
[], "source": [ "t = pd.DataFrame(confusion_matrix(test_targets,predictions), \n", " columns=['Predictions: Not Pork(0)','Predictions:Pork(1)'])\n", "t.index = ['Acutal: Not Pork(0)', 'Acutal: Pork (1)']\n", "t" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }