{ "cells": [ { "cell_type": "markdown", "id": "d57026d3", "metadata": {}, "source": [ "# Settings" ] }, { "cell_type": "code", "execution_count": null, "id": "26e1d687", "metadata": {}, "outputs": [], "source": [ "featureMatrixFilePath = 'E:/MotionMatchingFeatureMatrix.csv'" ] }, { "cell_type": "markdown", "id": "a45e3d25", "metadata": {}, "source": [ "# Load feature matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "25a44238", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from sklearn import preprocessing\n", "from sklearn.decomposition import PCA\n", "\n", "def PrintGreen(text):\n", "    print('\\x1b[6;30;42m' + text + '\\x1b[0m')\n", "\n", "def PrintRed(text):\n", "    print('\\33[41m' + text + '\\x1b[0m')" ] }, { "cell_type": "code", "execution_count": null, "id": "5bdc881e", "metadata": {}, "outputs": [], "source": [ "# Load the feature matrix from CSV\n", "originalData = pd.read_csv(featureMatrixFilePath, na_values='null')\n", "if originalData.shape[0] > 0 and originalData.shape[1] > 0:\n", "    PrintGreen(\"Loading succeeded\")\n", "else:\n", "    PrintRed(\"Loading failed!\")\n", "\n", "print(\"frames = \" + str(originalData.shape[0]))\n", "print(\"featureComponents = \" + str(originalData.shape[1]))\n", "\n", "# Make sure all columns are shown when displaying the data\n", "pd.set_option('display.max_columns', originalData.shape[1])" ] }, { "cell_type": "code", "execution_count": null, "id": "e41c7caf", "metadata": {}, "outputs": [], "source": [ "originalData.head(15)" ] },
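{ "cell_type": "markdown", "id": "7f2a9c10", "metadata": {}, "source": [ "Optional sanity check (a minimal sketch, assuming only the `originalData` frame loaded above): count the rows that contain missing values and print summary statistics per feature component before any cleaning." ] }, { "cell_type": "code", "execution_count": null, "id": "8b3c0d21", "metadata": {}, "outputs": [], "source": [ "# Optional sanity check (sketch): count rows that contain missing values and\n", "# show summary statistics per feature component before any cleaning.\n", "print(\"Rows with missing values = \" + str(originalData.isna().any(axis=1).sum()))\n", "originalData.describe()" ] },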
{ "cell_type": "markdown", "id": "b3bbc348", "metadata": {}, "source": [ "# Data preparation\n", "\n", "1. Data Cleaning: We will remove unused feature components that are currently zeroed out because they are not implemented yet.\n", "2. Feature Selection: This already happened in the motion matching gem. So far we have a position, a velocity and a trajectory feature.\n", "3. Data Transformation: We will change the scale of our features by normalizing them using min-max normalization. We do not modify the distribution for now.\n", "4. Feature Engineering / Data Augmentation: We will not derive new variables for now.\n", "5. Dimensionality Reduction: We will not create compact projections of the data for now.\n", "\n", "# Data cleaning\n", "Remove columns containing only 0.0" ] }, { "cell_type": "code", "execution_count": null, "id": "2a64a465", "metadata": {}, "outputs": [], "source": [ "def CleanData(data):\n", "    # Remove columns with only zeros\n", "    cleanedData = data[data.columns[(data != 0).any()]]\n", "\n", "    if cleanedData.shape[0] != data.shape[0]:\n", "        PrintRed(\"Frame count of original and cleaned data should match!\")\n", "\n", "    if cleanedData.shape[1] < data.shape[1]:\n", "        PrintGreen(str(data.shape[1] - cleanedData.shape[1]) + \" feature components containing only 0.0 values removed\")\n", "\n", "    print(\"frames = \" + str(cleanedData.shape[0]))\n", "    print(\"featureComponents = \" + str(cleanedData.shape[1]))\n", "\n", "    return cleanedData\n", "\n", "\n", "cleanedData = CleanData(originalData)\n", "frameCount = cleanedData.shape[0]\n", "cleanedFeatureComponentCount = cleanedData.shape[1]" ] }, { "cell_type": "code", "execution_count": null, "id": "81b759dc", "metadata": {}, "outputs": [], "source": [ "cleanedData.head(15)" ] }, { "cell_type": "markdown", "id": "b9e9a8e2", "metadata": {}, "source": [ "# Feature analysis visualizations" ] }, { "cell_type": "markdown", "id": "643dd550", "metadata": {}, "source": [ "## Histogram per feature component showing value distributions" ] }, { "cell_type": "code", "execution_count": null, "id": "f3cbe721", "metadata": {}, "outputs": [], "source": [ "def Histogram(data):\n", "    data.hist(figsize=[32, 32])\n", "\n", "\n", "Histogram(cleanedData)" ] },
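{ "cell_type": "markdown", "id": "3d4e5f60", "metadata": {}, "source": [ "Numeric complement to the histograms (a minimal sketch, using only the `cleanedData` frame from above): per-component summary statistics plus skewness, which makes strongly skewed feature components easier to spot." ] }, { "cell_type": "code", "execution_count": null, "id": "4e5f6a71", "metadata": {}, "outputs": [], "source": [ "# Numeric complement to the histograms (sketch): one row per feature component\n", "# with count/mean/std/min/max plus skewness, computed from cleanedData.\n", "summary = cleanedData.describe().T\n", "summary['skew'] = cleanedData.skew()\n", "summary" ] },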
{ "cell_type": "markdown", "id": "06cc1e64", "metadata": {}, "source": [ "## Boxplot per feature component\n", "- Orange line inside the box = median\n", "- Box = interquartile range, which means 50% of the data lies within the box\n", "- Black line range = 99.3% of the values\n", "- Semi-transparent outliers represent the remaining 0.7%" ] },
" ] }, { "cell_type": "code", "execution_count": null, "id": "4ff8c258", "metadata": {}, "outputs": [], "source": [ "def BoxPlot(data, featureComponentCount):\n", " minValuePerColumn = data.min(axis=0)\n", " maxValuePerColumn = data.max(axis=0)\n", "\n", " fig1, ax1 = plt.subplots(figsize=(20,20))\n", " ax1.set_title('Feature Component Boxplot')\n", "\n", " # Render outliers\n", " flierprops = dict(marker='o', markerfacecolor='gainsboro', markersize=1, linestyle='none', markeredgecolor='gainsboro', alpha=0.005)\n", " ax1.boxplot(data, vert=False, flierprops=flierprops)\n", "\n", " # Create an array containing values ranging from 1 to featureComponentCount\n", " elementNumbers = np.array([i+1 for i in range(featureComponentCount)])\n", "\n", " plt.yticks(elementNumbers, data.columns)\n", " plt.show()\n", "\n", "\n", "BoxPlot(cleanedData, cleanedData.shape[1])" ] }, { "cell_type": "markdown", "id": "023ab81b", "metadata": {}, "source": [ "## Feature correlation heatmap" ] }, { "cell_type": "code", "execution_count": null, "id": "290aff93", "metadata": {}, "outputs": [], "source": [ "# not used in drawing, this just prints the values\n", "correlationMatrix = cleanedData.corr()\n", "\n", "# plot the correlation heatmap\n", "plt.figure(figsize=[32, 32])\n", "sns.heatmap(data=correlationMatrix)" ] }, { "cell_type": "markdown", "id": "2ce964ae", "metadata": {}, "source": [ "## Scatterplot using PCA\n", "Use principal component analysis to project the multi-dimensional data down to 2D" ] }, { "cell_type": "code", "execution_count": null, "id": "d77c43aa", "metadata": {}, "outputs": [], "source": [ "def ScatterPlotPCA(data):\n", " pca = PCA(n_components=2)\n", " pca.fit(data)\n", " pcaData = pca.transform(data)\n", " \n", " pca_x = pcaData[:, 0]\n", " pca_y = pcaData[:, 1]\n", " plt.figure(figsize=(16, 16))\n", " plt.scatter(pca_x, pca_y, s=2.0, alpha=0.5)\n", "\n", " \n", "ScatterPlotPCA(cleanedData)" ] }, { "cell_type": "markdown", "id": "e3c4c80a", "metadata": {}, "source": [ "# Data Transformation\n", "# Normalization" ] }, { "cell_type": "code", "execution_count": null, "id": "8c71d0a5", "metadata": {}, "outputs": [], "source": [ "# mean normalization\n", "# normalized_df=(df-df.mean())/df.std()\n", "\n", "# min-max normalization\n", "# normalized_df=(df-df.min())/(df.max()-df.min())\n", "\n", "# Note: Pandas automatically applies colomn-wise function in the code above.\n", "\n", "# Using sklearn\n", "x = cleanedData.values\n", "min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))\n", "x_scaled = min_max_scaler.fit_transform(x)\n", "\n", "normalizedData = pd.DataFrame(data=x_scaled, columns=cleanedData.columns) # copy column names from source\n", "\n", "# min values per column used to normalize the data\n", "print(\"Minimum values per feature component / column\")\n", "print(min_max_scaler.data_min_)\n", "print(\"\")\n", "\n", "# max values per column used to normalize the data\n", "print(\"Maximum values per feature component / column\")\n", "print(min_max_scaler.data_max_)" ] }, { "cell_type": "code", "execution_count": null, "id": "4b5bfd21", "metadata": {}, "outputs": [], "source": [ "normalizedData.head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "c094066c", "metadata": {}, "outputs": [], "source": [ "Histogram(normalizedData)" ] }, { "cell_type": "code", "execution_count": null, "id": "b81660c2", "metadata": {}, "outputs": [], "source": [ "ScatterPlotPCA(normalizedData)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": 
"python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }