{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "import numpy as np\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_iris(ratio=0.8):\n",
    "    features, target = datasets.load_iris(True)\n",
    "    \n",
    "    num_samples = len(target)\n",
    "    num_train = math.ceil(num_samples * ratio)\n",
    "    \n",
    "    # 随机打乱数据\n",
    "    idx = np.random.permutation(np.arange(num_samples))\n",
    "    traindata = features[idx[:num_train]], target[idx[:num_train]]\n",
    "    validdata = features[idx[num_train:]], target[idx[num_train:]]\n",
    "    \n",
    "    return traindata, validdata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 作业三\n",
    "\n",
    "## 二、朴素贝叶斯分类器\n",
    "\n",
    "要求：\n",
    "\n",
    "* 用朴素贝叶斯构造一个iris数据集的分类器\n",
    "* 在尽量不修改代码结构的前提下完成工作\n",
    "\n",
    "ETA：1-5 hours"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 只需要修改这一部分 -- 代码量在40行以内\n",
    "class NaiveBayes:\n",
    "    def __init__(self, smooth = 1):\n",
    "        self.smooth = smooth # lambda\n",
    "        self.conditional_prob = None # 条件概率\n",
    "        self.prior_prob = None # 先验概率\n",
    "        \n",
    "        \n",
    "    def __call__(self, features):\n",
    "        return self.predict(features)\n",
    "        \n",
    "        \n",
    "    def fit(self, features, target):\n",
    "        \"\"\"\n",
    "        \n",
    "        给定特征及真实结果，拟合分类器\n",
    "        \n",
    "        将预测过程中所需要用到的条件概率及先验概率全部计算好\n",
    "        \"\"\"\n",
    "        \n",
    "        self.num_features = features.shape[-1] # 特征的数目；iris数据集中共有4个特征\n",
    "        self.target_labels = np.unique(target) # 预测结果的可能值：c_k\n",
    "        # features_labels[j][l]表示第 j 个特征的第 l 个可能值: a_{jl}\n",
    "        self.features_labels = [np.unique(features[:, i]) for i in range(features.shape[-1])]\n",
    "        \n",
    "        self.conditional_prob = self._conditional_prob(features, target)\n",
    "        self.prior_prob = self._prior_prob(target)\n",
    "        \n",
    "        \n",
    "    def predict(self, features):\n",
    "        # 预测单个数据\n",
    "        if len(features.shape) == 1:\n",
    "            return np.array([self._predict_single(features)])\n",
    "        # 批量预测\n",
    "        elif len(features.shape) == 2:\n",
    "            N = features.shape[0]\n",
    "            return np.array([self._predict_single(features[i, :]) for i in range(N)])\n",
    "        else:\n",
    "            raise(ValueError(\"Unsupported features size, should be 1 or 2 dimensional\"))\n",
    "            \n",
    "    \n",
    "    def _predict_single(self, feature):\n",
    "        # 实现它\n",
    "        raise(NotImplementedError())\n",
    "        \n",
    "    \n",
    "    def _prior_prob(self, target):\n",
    "        # 利用式4.11估计先验概率\n",
    "        N = len(target)\n",
    "        K = len(self.target_labels)\n",
    "        \n",
    "        # 实现它\n",
    "        raise(NotImplementedError())\n",
    "        \n",
    "        return prob\n",
    "    \n",
    "    \n",
    "    def _conditional_prob(self, features, target):\n",
    "        # 利用式4.10估计条件概率\n",
    "        # prob[k][j][l] is P(X^{(j)} = a_{jl} | Y = c_k)\n",
    "        prob = [[np.zeros(feature.shape) for feature in self.features_labels] \n",
    "                for _ in self.target_labels] # prob里一共存储了多少个数？\n",
    "        \n",
    "        # 实现它\n",
    "        raise(NotImplementedError())\n",
    "        \n",
    "        return prob"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "(X_train, Y_train), (X_valid, Y_valid) = load_iris()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "创建模型并拟合数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = NaiveBayes()\n",
    "model.fit(X_train, Y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "预测结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy: 0.8333\n"
     ]
    }
   ],
   "source": [
    "Y_pred = model.predict(X_valid)\n",
    "accuracy = np.sum(Y_pred == Y_valid)/len(Y_valid)\n",
    "print(f\"accuracy: {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "AI-Course",
   "language": "python",
   "name": "ai-course"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}