# Settings

In [None]:
# Path of the CSV file containing the feature matrix; it is passed to pd.read_csv when loading the data.
# NOTE(review): absolute path on drive E: - adjust to where the CSV lives on your machine.
featureMatrixFilePath = 'E:/MotionMatchingFeatureMatrix.csv'

# Load feature matrix

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA

def PrintGreen(text):
    """Print ``text`` highlighted with a green background via ANSI escape codes.

    Args:
        text: String to print; it is concatenated with the escape sequences,
            so a non-string argument raises ``TypeError``.
    """
    highlightOn = '\x1b[6;30;42m'
    highlightOff = '\x1b[0m'  # resets all terminal attributes again
    print(highlightOn + text + highlightOff)
    
def PrintRed(text):
    """Print ``text`` highlighted with a red background via ANSI escape codes.

    Args:
        text: String to print; it is concatenated with the escape sequences,
            so a non-string argument raises ``TypeError``.
    """
    highlightOn = '\33[41m'  # '\33' is the octal spelling of the ESC character
    highlightOff = '\x1b[0m'  # resets all terminal attributes again
    print(highlightOn + text + highlightOff)

In [None]:
# Load the feature matrix from CSV; cells holding the literal text 'null' are read as NaN
originalData = pd.read_csv(featureMatrixFilePath, na_values='null')

# Report whether the loaded table contains at least one row and one column
if originalData.shape[0] > 0 and originalData.shape[1] > 0:
    PrintGreen("Loading succeeded")
else:
    PrintRed("Loading failed!")

print("frames = " + str(originalData.shape[0]))
print("featureComponents = " + str(originalData.shape[1]))

# Ensure to show all columns.
# The full option key is required: the abbreviated 'max_columns' pattern also matches
# 'styler.render.max_columns' on pandas >= 1.4 and makes set_option raise an OptionError.
pd.set_option('display.max_columns', originalData.shape[1])

In [None]:
# Preview the first 15 rows of the loaded feature matrix
originalData.head(15)

# Data preparation

1. Data Cleaning: We will remove unused feature components that are zeroed out for now as they are not implemented yet.
2. Feature Selection: Happened in the motion matching gem. So far we have a position, velocity and a trajectory feature.
3. Data Transformation: We will change the scale of our features by normalizing them using min-max normalization. We do not modify the distribution for now.
4. Feature Engineering / Data Augmentation: We will not derive new variables for now.
5. Dimensionality Reduction: We will not create compact projections of the data for now.

# Data cleaning
Remove columns containing only 0.0

In [None]:
def CleanData(data):
    """Return ``data`` without the columns that contain nothing but zeros.

    Prints how many columns were removed as well as the resulting frame and
    feature component counts.

    Args:
        data: pandas DataFrame with one row per frame and one column per
            feature component.

    Returns:
        The selection of ``data`` restricted to the columns that hold at least
        one value different from 0.
    """
    # True for every column with at least one non-zero entry
    hasNonZeroValue = (data != 0).any()
    keptColumns = data.columns[hasNonZeroValue]
    cleanedData = data[keptColumns]

    originalFrames, originalComponents = data.shape
    cleanedFrames, cleanedComponents = cleanedData.shape

    # Only columns are dropped, so the number of rows must stay the same
    if cleanedFrames != originalFrames:
        PrintRed("Frame count of original and cleaned data should match!")

    if cleanedComponents < originalComponents:
        removedComponents = originalComponents - cleanedComponents
        PrintGreen(str(removedComponents) + " feature components containing only 0.0 values removed")

    print("frames = " + str(cleanedFrames))
    print("featureComponents = " + str(cleanedComponents))

    return cleanedData


# Drop the all-zero columns and remember the dimensions (rows, columns) of the cleaned matrix
cleanedData = CleanData(originalData)
frameCount, cleanedFeatureComponentCount = cleanedData.shape

In [None]:
# Preview the first 15 rows after the all-zero columns have been removed
cleanedData.head(15)

# Feature analysis visualizations

## Histogram per feature component showing value distributions

In [None]:
def Histogram(data):
    """Draw one histogram per column of ``data`` on a single large figure.

    Args:
        data: pandas DataFrame whose columns are plotted.
    """
    # DataFrame.hist draws as a side effect; the axes it returns are not needed here
    data.hist(figsize=[32, 32])


Histogram(cleanedData)

## Boxplot per feature component
Median in orange inside the box<br/>
Box = Interquartile range, which means 50% of the data lies within the box<br/>
Black whisker lines = about 99.3% of the values (for normally distributed data)<br/>
Semi-transparent outliers represent the remaining 0.7%<br/>

In [None]:
def BoxPlot(data, featureComponentCount=None):
    """Draw a horizontal boxplot with one box per column of ``data``.

    Args:
        data: Table whose columns are plotted; it must provide ``.columns``
            and ``.shape`` (a pandas DataFrame in this notebook).
        featureComponentCount: Number of y-axis ticks to label with column
            names. Defaults to the column count of ``data``.
    """
    if featureComponentCount is None:
        featureComponentCount = data.shape[1]

    fig1, ax1 = plt.subplots(figsize=(20, 20))
    ax1.set_title('Feature Component Boxplot')

    # Render outliers as tiny, nearly transparent dots so they do not dominate the plot
    flierprops = dict(marker='o', markerfacecolor='gainsboro', markersize=1, linestyle='none', markeredgecolor='gainsboro', alpha=0.005)
    ax1.boxplot(data, vert=False, flierprops=flierprops)

    # Tick positions 1..featureComponentCount, one per box, each labelled with its column name
    elementNumbers = np.arange(1, featureComponentCount + 1)

    plt.yticks(elementNumbers, data.columns)
    plt.show()


BoxPlot(cleanedData, cleanedData.shape[1])

## Feature correlation heatmap

In [None]:
# Pairwise correlation between all feature components; this matrix is the input of the heatmap below
correlationMatrix = cleanedData.corr()

# Render the correlation matrix as a heatmap on a large square figure
plt.figure(figsize=(32, 32))
sns.heatmap(correlationMatrix)

## Scatterplot using PCA
Use principal component analysis to project the multi-dimensional data down to 2D

In [None]:
def ScatterPlotPCA(data):
    """Scatter-plot ``data`` after projecting it onto its first two principal components.

    Args:
        data: Table or 2D array with one sample per row; it is used both to
            fit the PCA and as the data that gets projected.
    """
    # Fit a two-component PCA and project the very same data with it
    projected = PCA(n_components=2).fit(data).transform(data)

    plt.figure(figsize=(16, 16))
    # Column 0 / 1 hold the first / second principal component of every row
    plt.scatter(projected[:, 0], projected[:, 1], s=2.0, alpha=0.5)


ScatterPlotPCA(cleanedData)

# Data Transformation
# Normalization

In [None]:
# Two common ways to rescale features (pandas applies both formulas column-wise):
#   standardization (z-score): normalized_df = (df - df.mean()) / df.std()
#   min-max normalization:     normalized_df = (df - df.min()) / (df.max() - df.min())
#
# Here the min-max variant is computed with scikit-learn's MinMaxScaler.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
x = cleanedData.values
x_scaled = min_max_scaler.fit_transform(x)

# Wrap the scaled array in a DataFrame again, copying the column names from the source
normalizedData = pd.DataFrame(x_scaled, columns=cleanedData.columns)

# Per-column min and max values the scaler used to normalize the data
print("Minimum values per feature component / column", min_max_scaler.data_min_, "", sep="\n")
print("Maximum values per feature component / column", min_max_scaler.data_max_, sep="\n")

In [None]:
# Preview the first 15 rows of the min-max normalized feature matrix
normalizedData.head(15)

In [None]:
# Value distribution per feature component after min-max normalization
Histogram(normalizedData)

In [None]:
# 2D PCA projection of the min-max normalized data
ScatterPlotPCA(normalizedData)