You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
353 lines
9.1 KiB
Plaintext
353 lines
9.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d57026d3",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Settings"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "26e1d687",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"featureMatrixFilePath = 'E:/MotionMatchingFeatureMatrix.csv'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a45e3d25",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Load feature matrix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "25a44238",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import seaborn as sns\n",
|
|
"from sklearn import preprocessing\n",
|
|
"from sklearn.decomposition import PCA\n",
|
|
"\n",
|
|
"def PrintGreen(text):\n",
|
|
" print('\\x1b[6;30;42m' + text + '\\x1b[0m')\n",
|
|
" \n",
|
|
"def PrintRed(text):\n",
|
|
" print('\\33[41m' + text + '\\x1b[0m')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5bdc881e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load the feature matrix from CSV\n",
|
|
"originalData = pd.read_csv(featureMatrixFilePath, na_values = 'null')\n",
|
|
"if originalData.shape[0] > 0 and originalData.shape[1] > 0:\n",
|
|
" PrintGreen(\"Loading succeeded\");\n",
|
|
"else:\n",
|
|
" PrintRed(\"Loading failed!\");\n",
|
|
"\n",
|
|
"print(\"frames = \" + str(originalData.shape[0]))\n",
|
|
"print(\"featureComponents = \" + str(originalData.shape[1]))\n",
|
|
"\n",
|
|
"# Make sure all columns are shown when displaying the data frame\n",
|
|
"pd.set_option('max_columns', originalData.shape[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e41c7caf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"originalData.head(15)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b3bbc348",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data preparation\n",
|
|
"\n",
|
|
"1. Data Cleaning: We will remove unused feature components that are zeroed out for now as they are not implemented yet.\n",
|
|
"2. Feature Selection: Already happens in the motion matching gem. So far we have a position, a velocity and a trajectory feature.\n",
|
|
"3. Data Transformation: We will change the scale of our features by normalizing them using min-max normalization. We do not modify the distribution for now.\n",
|
|
"4. Feature Engineering / Data Augmentation: We will not derive new variables for now.\n",
|
|
"5. Dimensionality Reduction: We will not create compact projections of the data for now.\n",
|
|
"\n",
|
|
"# Data cleaning\n",
|
|
"Remove columns containing only 0.0"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2a64a465",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def CleanData(data):\n",
|
|
" # Remove columns with only zeros\n",
|
|
" cleanedData = data[data.columns[(data != 0).any()]]\n",
|
|
" \n",
|
|
" if cleanedData.shape[0] != data.shape[0]:\n",
|
|
" PrintRed(\"Frame count of original and cleaned data should match!\")\n",
|
|
" \n",
|
|
" if cleanedData.shape[1] < data.shape[1]:\n",
|
|
" PrintGreen(str(data.shape[1] - cleanedData.shape[1]) + \" feature components containing only 0.0 values removed\");\n",
|
|
" \n",
|
|
" print(\"frames = \" + str(cleanedData.shape[0]))\n",
|
|
" print(\"featureComponents = \" + str(cleanedData.shape[1]))\n",
|
|
" \n",
|
|
" return cleanedData\n",
|
|
"\n",
|
|
"\n",
|
|
"cleanedData = CleanData(originalData);\n",
|
|
"frameCount = cleanedData.shape[0]\n",
|
|
"cleanedFeatureComponentCount = cleanedData.shape[1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "81b759dc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cleanedData.head(15)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b9e9a8e2",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Feature analysis visualizations"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "643dd550",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Histogram per feature component showing value distributions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f3cbe721",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def Histogram(data):\n",
|
|
" image = data.hist(figsize = [32, 32])\n",
|
|
"\n",
|
|
" \n",
|
|
"Histogram(cleanedData)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "06cc1e64",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Boxplot per feature component\n",
|
|
"Median in orange inside the box<br/>\n",
|
|
"Box = Interquartile range, which means 50% of the data lies within the box<br/>\n",
|
|
"Black line range = 99.3% of the values<br/>\n",
|
|
"Semi-transparent outliers represent the remaining 0.7%<br/>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4ff8c258",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def BoxPlot(data, featureComponentCount):\n",
|
|
" minValuePerColumn = data.min(axis=0)\n",
|
|
" maxValuePerColumn = data.max(axis=0)\n",
|
|
"\n",
|
|
" fig1, ax1 = plt.subplots(figsize=(20,20))\n",
|
|
" ax1.set_title('Feature Component Boxplot')\n",
|
|
"\n",
|
|
" # Render outliers\n",
|
|
" flierprops = dict(marker='o', markerfacecolor='gainsboro', markersize=1, linestyle='none', markeredgecolor='gainsboro', alpha=0.005)\n",
|
|
" ax1.boxplot(data, vert=False, flierprops=flierprops)\n",
|
|
"\n",
|
|
" # Create an array containing values ranging from 1 to featureComponentCount\n",
|
|
" elementNumbers = np.array([i+1 for i in range(featureComponentCount)])\n",
|
|
"\n",
|
|
" plt.yticks(elementNumbers, data.columns)\n",
|
|
" plt.show()\n",
|
|
"\n",
|
|
"\n",
|
|
"BoxPlot(cleanedData, cleanedData.shape[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "023ab81b",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Feature correlation heatmap"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "290aff93",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# compute the pairwise correlation between all feature components (input for the heatmap below)\n",
|
|
"correlationMatrix = cleanedData.corr()\n",
|
|
"\n",
|
|
"# plot the correlation heatmap\n",
|
|
"plt.figure(figsize=[32, 32])\n",
|
|
"sns.heatmap(data=correlationMatrix)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2ce964ae",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Scatterplot using PCA\n",
|
|
"Use principal component analysis to project the multi-dimensional data down to 2D"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d77c43aa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def ScatterPlotPCA(data):\n",
|
|
" pca = PCA(n_components=2)\n",
|
|
" pca.fit(data)\n",
|
|
" pcaData = pca.transform(data)\n",
|
|
" \n",
|
|
" pca_x = pcaData[:, 0]\n",
|
|
" pca_y = pcaData[:, 1]\n",
|
|
" plt.figure(figsize=(16, 16))\n",
|
|
" plt.scatter(pca_x, pca_y, s=2.0, alpha=0.5)\n",
|
|
"\n",
|
|
" \n",
|
|
"ScatterPlotPCA(cleanedData)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e3c4c80a",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data Transformation\n",
|
|
"# Normalization"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8c71d0a5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# standardization (z-score normalization)\n",
|
|
"# normalized_df=(df-df.mean())/df.std()\n",
|
|
"\n",
|
|
"# min-max normalization\n",
|
|
"# normalized_df=(df-df.min())/(df.max()-df.min())\n",
|
|
"\n",
|
|
"# Note: pandas automatically applies these operations column-wise in the code above.\n",
|
|
"\n",
|
|
"# Using sklearn\n",
|
|
"x = cleanedData.values\n",
|
|
"min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))\n",
|
|
"x_scaled = min_max_scaler.fit_transform(x)\n",
|
|
"\n",
|
|
"normalizedData = pd.DataFrame(data=x_scaled, columns=cleanedData.columns) # copy column names from source\n",
|
|
"\n",
|
|
"# min values per column used to normalize the data\n",
|
|
"print(\"Minimum values per feature component / column\")\n",
|
|
"print(min_max_scaler.data_min_)\n",
|
|
"print(\"\")\n",
|
|
"\n",
|
|
"# max values per column used to normalize the data\n",
|
|
"print(\"Maximum values per feature component / column\")\n",
|
|
"print(min_max_scaler.data_max_)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4b5bfd21",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"normalizedData.head(15)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c094066c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"Histogram(normalizedData)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b81660c2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ScatterPlotPCA(normalizedData)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|