import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load responses.csv into a DataFrame; the working frame is sliced from it later.
data = pd.read_csv("responses.csv")
# Preview the first five rows.
data.head()
Music Slow songs or fast songs Dance Folk Country Classical music Musical Pop Rock Metal or Hardrock ... Age Height Weight Number of siblings Gender Left - right handed Education Only child Village - town House - block of flats
0 5.0 3.0 2.0 1.0 2.0 2.0 1.0 5.0 5.0 1.0 ... 20.0 163.0 48.0 1.0 female right handed college/bachelor degree no village block of flats
1 4.0 4.0 2.0 1.0 1.0 1.0 2.0 3.0 5.0 4.0 ... 19.0 163.0 58.0 2.0 female right handed college/bachelor degree no city block of flats
2 5.0 5.0 2.0 2.0 3.0 4.0 5.0 3.0 5.0 3.0 ... 20.0 176.0 67.0 2.0 female right handed secondary school no city block of flats
3 5.0 3.0 2.0 1.0 1.0 1.0 1.0 2.0 2.0 1.0 ... 22.0 172.0 59.0 1.0 female right handed college/bachelor degree yes city house/bungalow
4 5.0 3.0 4.0 3.0 2.0 4.0 3.0 5.0 3.0 1.0 ... 20.0 170.0 59.0 1.0 female right handed secondary school no village house/bungalow

5 rows × 150 columns

# Take the first 19 columns as the working frame.  The explicit .copy() makes
# `df` independent of `data`, so adding columns below does not raise pandas'
# SettingWithCopyWarning or depend on view-vs-copy behaviour of the slice.
df = data.iloc[:, 0:19].copy()

# Extra columns pulled from `data`, keyed by the (shorter) name used in `df`.
extra_columns = {
    "Age": "Age",
    "Height": "Height",
    "Weight": "Weight",
    "Siblings": "Number of siblings",
    "Gender": "Gender",
    "Education": "Education",
    "Location": "Village - town",
}
for new_name, source_name in extra_columns.items():
    df[new_name] = data[source_name]
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Music                     1007 non-null   float64
 1   Slow songs or fast songs  1008 non-null   float64
 2   Dance                     1006 non-null   float64
 3   Folk                      1005 non-null   float64
 4   Country                   1005 non-null   float64
 5   Classical music           1003 non-null   float64
 6   Musical                   1008 non-null   float64
 7   Pop                       1007 non-null   float64
 8   Rock                      1004 non-null   float64
 9   Metal or Hardrock         1007 non-null   float64
 10  Punk                      1002 non-null   float64
 11  Hiphop, Rap               1006 non-null   float64
 12  Reggae, Ska               1003 non-null   float64
 13  Swing, Jazz               1004 non-null   float64
 14  Rock n roll               1003 non-null   float64
 15  Alternative               1003 non-null   float64
 16  Latino                    1002 non-null   float64
 17  Techno, Trance            1003 non-null   float64
 18  Opera                     1009 non-null   float64
 19  Age                       1003 non-null   float64
 20  Height                    990 non-null    float64
 21  Weight                    990 non-null    float64
 22  Siblings                  1004 non-null   float64
 23  Gender                    1004 non-null   object 
 24  Education                 1009 non-null   object 
 25  Location                  1006 non-null   object 
dtypes: float64(23), object(3)
memory usage: 205.3+ KB
None
# Keep only rows without any missing value and renumber them from 0.
df = df.dropna().reset_index(drop=True)
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Music                     898 non-null    float64
 1   Slow songs or fast songs  898 non-null    float64
 2   Dance                     898 non-null    float64
 3   Folk                      898 non-null    float64
 4   Country                   898 non-null    float64
 5   Classical music           898 non-null    float64
 6   Musical                   898 non-null    float64
 7   Pop                       898 non-null    float64
 8   Rock                      898 non-null    float64
 9   Metal or Hardrock         898 non-null    float64
 10  Punk                      898 non-null    float64
 11  Hiphop, Rap               898 non-null    float64
 12  Reggae, Ska               898 non-null    float64
 13  Swing, Jazz               898 non-null    float64
 14  Rock n roll               898 non-null    float64
 15  Alternative               898 non-null    float64
 16  Latino                    898 non-null    float64
 17  Techno, Trance            898 non-null    float64
 18  Opera                     898 non-null    float64
 19  Age                       898 non-null    float64
 20  Height                    898 non-null    float64
 21  Weight                    898 non-null    float64
 22  Siblings                  898 non-null    float64
 23  Gender                    898 non-null    object 
 24  Education                 898 non-null    object 
 25  Location                  898 non-null    object 
dtypes: float64(23), object(3)
memory usage: 182.5+ KB
None
# Cast every float64 column to int.  Columns are selected by their dtype rather
# than by probing the Python type of the value in row 1, so the cast no longer
# depends on the frame having exactly 26 columns or at least two rows.
float_columns = df.select_dtypes(include="float64").columns
df = df.astype({column: int for column in float_columns})
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Music                     898 non-null    int64 
 1   Slow songs or fast songs  898 non-null    int64 
 2   Dance                     898 non-null    int64 
 3   Folk                      898 non-null    int64 
 4   Country                   898 non-null    int64 
 5   Classical music           898 non-null    int64 
 6   Musical                   898 non-null    int64 
 7   Pop                       898 non-null    int64 
 8   Rock                      898 non-null    int64 
 9   Metal or Hardrock         898 non-null    int64 
 10  Punk                      898 non-null    int64 
 11  Hiphop, Rap               898 non-null    int64 
 12  Reggae, Ska               898 non-null    int64 
 13  Swing, Jazz               898 non-null    int64 
 14  Rock n roll               898 non-null    int64 
 15  Alternative               898 non-null    int64 
 16  Latino                    898 non-null    int64 
 17  Techno, Trance            898 non-null    int64 
 18  Opera                     898 non-null    int64 
 19  Age                       898 non-null    int64 
 20  Height                    898 non-null    int64 
 21  Weight                    898 non-null    int64 
 22  Siblings                  898 non-null    int64 
 23  Gender                    898 non-null    object
 24  Education                 898 non-null    object
 25  Location                  898 non-null    object
dtypes: int64(23), object(3)
memory usage: 182.5+ KB
None
# Show how often each value of the "Music" column occurs (NaN would be counted too).
print(df['Music'].value_counts(dropna =False))
Music
5    735
4    114
3     30
2     10
1      9
Name: count, dtype: int64
# Remove every row whose "Music" value is below 4.
# All matching rows are dropped in one label-based call; this replaces a
# row-by-row drop that had to offset each position by the number of rows
# already removed and was only correct on a freshly reset index.
filtre = df.Music < 4
df.drop(index=df[filtre].index, inplace=True)
df.reset_index(drop=True, inplace=True)
# Number of rows left after filtering.
row = len(df.index)
print(df['Music'].value_counts(dropna =False))
Music
5    735
4    114
Name: count, dtype: int64
# Remove the "Music" column from the working frame, then preview the result.
df.drop(columns="Music", inplace=True)
df.head()
Slow songs or fast songs Dance Folk Country Classical music Musical Pop Rock Metal or Hardrock Punk ... Latino Techno, Trance Opera Age Height Weight Siblings Gender Education Location
0 3 2 1 2 2 1 5 5 1 1 ... 1 1 1 20 163 48 1 female college/bachelor degree village
1 4 2 1 1 1 2 3 5 4 4 ... 2 1 1 19 163 58 2 female college/bachelor degree city
2 5 2 2 3 4 5 3 5 3 4 ... 5 1 3 20 176 67 2 female secondary school city
3 3 2 1 1 1 1 2 2 1 4 ... 1 2 1 22 172 59 1 female college/bachelor degree city
4 3 4 3 2 4 3 5 3 1 2 ... 4 2 2 20 170 59 1 female secondary school village

5 rows × 25 columns

# Encode the three text columns (Gender, Education, Location) as integers.
#
# Gender and Location are popped first and re-appended at the end, so the
# resulting column order is "..., Education, Gender, Location".
gender = df.pop('Gender')
location = df.pop('Location')

# Ordinal code for each education label; any label not listed here becomes 6.
education_codes = {
    'currently a primary school pupil': 0,
    'primary school': 1,
    'secondary school': 2,
    'college/bachelor degree': 3,
    'masters degree': 4,
    'doctorate degree': 5,
}
# One vectorised lookup instead of a per-row .loc loop: no dependency on a
# previously computed row count, and no ints written into an object column.
df['Education'] = df['Education'].map(education_codes).fillna(6).astype(int)

# A single 0/1 column per binary category (avoids the dummy variable trap):
# Gender is 1 for 'female', Location is 1 for 'city'.
df['Gender'] = (gender == 'female').astype(int)
df['Location'] = (location == 'city').astype(int)
df.head()
Slow songs or fast songs Dance Folk Country Classical music Musical Pop Rock Metal or Hardrock Punk ... Latino Techno, Trance Opera Age Height Weight Siblings Education Gender Location
0 3 2 1 2 2 1 5 5 1 1 ... 1 1 1 20 163 48 1 3 1 0
1 4 2 1 1 1 2 3 5 4 4 ... 2 1 1 19 163 58 2 3 1 1
2 5 2 2 3 4 5 3 5 3 4 ... 5 1 3 20 176 67 2 2 1 1
3 3 2 1 1 1 1 2 2 1 4 ... 1 2 1 22 172 59 1 3 1 1
4 3 4 3 2 4 3 5 3 1 2 ... 4 2 2 20 170 59 1 2 1 0

5 rows × 25 columns

# Inspect the dtype of every column after the encoding step.
df.dtypes
Slow songs or fast songs    int64
Dance                       int64
Folk                        int64
Country                     int64
Classical music             int64
Musical                     int64
Pop                         int64
Rock                        int64
Metal or Hardrock           int64
Punk                        int64
Hiphop, Rap                 int64
Reggae, Ska                 int64
Swing, Jazz                 int64
Rock n roll                 int64
Alternative                 int64
Latino                      int64
Techno, Trance              int64
Opera                       int64
Age                         int64
Height                      int64
Weight                      int64
Siblings                    int64
Education                   int64
Gender                      int64
Location                    int64
dtype: object
# Annotated correlation heatmap over all columns of the working frame.
f, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(df.corr(), annot=True, linewidths=1, fmt='.1f', ax=ax)
plt.show()

# Divide each of the first 18 columns by 5.  Column arithmetic is vectorised,
# so this avoids running a Python-level row-wise apply once per column while
# producing the same values.
for target in df.columns[:18]:
    df[target] = df[target] / 5
df.head()
Slow songs or fast songs Dance Folk Country Classical music Musical Pop Rock Metal or Hardrock Punk ... Latino Techno, Trance Opera Age Height Weight Siblings Education Gender Location
0 0.6 0.4 0.2 0.4 0.4 0.2 1.0 1.0 0.2 0.2 ... 0.2 0.2 0.2 20 163 48 1 3 1 0
1 0.8 0.4 0.2 0.2 0.2 0.4 0.6 1.0 0.8 0.8 ... 0.4 0.2 0.2 19 163 58 2 3 1 1
2 1.0 0.4 0.4 0.6 0.8 1.0 0.6 1.0 0.6 0.8 ... 1.0 0.2 0.6 20 176 67 2 2 1 1
3 0.6 0.4 0.2 0.2 0.2 0.2 0.4 0.4 0.2 0.8 ... 0.2 0.4 0.2 22 172 59 1 3 1 1
4 0.6 0.8 0.6 0.4 0.8 0.6 1.0 0.6 0.2 0.4 ... 0.8 0.4 0.4 20 170 59 1 2 1 0

5 rows × 25 columns

# The first 18 columns are the regression targets, the remaining ones the predictors.
TargetVariable=df.columns[:18]
Predictors=df.columns[18:]

# Unscaled predictor / target matrices
X_raw=df[Predictors].values
y_raw=df[TargetVariable].values

# Split the data into training and testing set BEFORE scaling, so that the
# scalers are fitted on the training rows only and no statistics of the
# held-out rows leak into preprocessing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=69420)

### Standardization of data ###
from sklearn.preprocessing import StandardScaler
PredictorScaler=StandardScaler()
TargetVarScaler=StandardScaler()

# Storing the fit objects (fitted on the training split) for later reference
PredictorScalerFit=PredictorScaler.fit(X_train)
TargetVarScalerFit=TargetVarScaler.fit(y_train)

# Generating the standardized values of both splits with the training statistics
X_train=PredictorScalerFit.transform(X_train)
X_test=PredictorScalerFit.transform(X_test)
y_train=TargetVarScalerFit.transform(y_train)
y_test=TargetVarScalerFit.transform(y_test)

# Standardized versions of the full matrices, kept under their original names
X=PredictorScalerFit.transform(X_raw)
y=TargetVarScalerFit.transform(y_raw)

# Quick sanity check with the shapes of Training and testing datasets
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(679, 7)
(679, 18)
(170, 7)
(170, 18)
# Switch for the hyper-parameter grid search: the indented code that follows
# only executes when `run` is set to True.
run = False
if run:
    # Function to generate Deep ANN model
    def make_regression_ann(Optimizer_trial):
        """Build and compile a dense regression network.

        The network takes 7 inputs, has three hidden layers of 64 units
        (relu, tanh, relu) and ends in an 18-unit output layer without an
        activation.  It is compiled with mean squared error as the loss.

        Parameters
        ----------
        Optimizer_trial
            Optimizer passed straight to ``model.compile``.

        Returns
        -------
        The compiled Keras ``Sequential`` model.
        """
        from keras.layers import Dense
        from keras.models import Sequential

        layers = [
            Dense(units=64, input_dim=7, kernel_initializer='normal', activation='relu'),
            Dense(units=64, kernel_initializer='normal', activation='tanh'),
            Dense(units=64, kernel_initializer='normal', activation='relu'),
            Dense(18, kernel_initializer='normal'),
        ]
        network = Sequential()
        for layer in layers:
            network.add(layer)
        network.compile(loss='mean_squared_error', optimizer=Optimizer_trial)
        return network

    ###########################################
    from sklearn.model_selection import GridSearchCV
    from keras.wrappers.scikit_learn import KerasRegressor

    # Hyper-parameter grid: every combination of these values is tried
    Parameter_Trials = dict(
        batch_size=[16, 32, 64, 128, 256, 512],
        epochs=[5, 10, 20, 50, 100],
        Optimizer_trial=['adam', 'rmsprop'],
    )

    # Wrap the model factory so that it can be used as a scikit-learn estimator
    RegModel = KerasRegressor(make_regression_ann, verbose=0)

    ###########################################
    from sklearn.metrics import make_scorer

    # Defining a custom function to calculate accuracy
    def Accuracy_Score(orig,pred):
        """Return 100 minus the mean absolute percentage error of `pred`.

        The percentage error of each element is taken relative to the
        magnitude of the true value, so negative targets contribute a
        positive error instead of cancelling out positive ones.  Elements
        whose true value is exactly 0 have no defined percentage error and
        are left out of the mean.

        Parameters
        ----------
        orig : array-like
            True values.
        pred : array-like
            Predicted values, same shape as `orig`.

        Returns
        -------
        float
            ``100 - MAPE``; NaN when every true value is 0.
        """
        orig = np.asarray(orig, dtype=float)
        pred = np.asarray(pred, dtype=float)
        magnitude = np.abs(orig)
        defined = magnitude > 0
        if defined.any():
            MAPE = np.mean(100 * (np.abs(orig - pred)[defined] / magnitude[defined]))
        else:
            # No element has a defined percentage error
            MAPE = float('nan')
        print('#'*70,'Accuracy:', 100-MAPE)
        return(100-MAPE)

    # Turn the accuracy function into a scorer (higher values are better)
    custom_Scoring=make_scorer(Accuracy_Score, greater_is_better=True)

    #########################################
    # Creating the Grid search space
    # NOTE(review): available built-in scorer names are listed by
    # sklearn.metrics.get_scorer_names() in recent scikit-learn releases.
    grid_search=GridSearchCV(estimator=RegModel, 
                            param_grid=Parameter_Trials, 
                            scoring=custom_Scoring, 
                            cv=5)

    #########################################
    # Measuring how much time it took to find the best params
    import time
    StartTime=time.time()

    # Running Grid Search for different parameters.
    # Only the training split is searched, so the held-out test rows stay
    # unseen until the final evaluation.
    grid_search.fit(X_train,y_train, verbose=1)

    EndTime=time.time()
    print("########## Total Time Taken: ", round((EndTime-StartTime)/60), 'Minutes')

    print('### Printing Best parameters ###')
    # A bare expression inside the `if` body is not displayed; print it explicitly.
    print(grid_search.best_params_)
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Layer sizes are read from the training arrays instead of being hard-coded,
# so the network follows any change in the number of predictors or targets.
n_features = X_train.shape[1]
n_targets = y_train.shape[1]

# create ANN model
model = Sequential()

# Dropout applied directly to the inputs (rate 0.2); it also fixes the input shape
model.add(Dropout(0.2, input_shape=(n_features,)))

# FIRST hidden layer; its input size is inferred from the Dropout layer
model.add(Dense(units=64, kernel_initializer='normal', activation='relu'))

# Second hidden layer
model.add(Dense(units=64, kernel_initializer='normal', activation='tanh'))

# Third hidden layer
model.add(Dense(units=64, kernel_initializer='normal', activation='relu'))

# Output layer: one unit per target column, no activation
model.add(Dense(n_targets, kernel_initializer='normal'))

# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Fitting the ANN to the Training set
# NOTE(review): batch_size=512 with epochs=5 yields very few weight updates on
# a training set of a few hundred rows - confirm these values are intended.
model.fit(X_train, y_train ,batch_size = 512, epochs = 5, verbose=1)

# Predict on the test rows and convert the outputs back from standardized units
Predictions = TargetVarScalerFit.inverse_transform(model.predict(X_test))

# Undo the standardization of the true targets and of the predictors as well
y_test_orig = TargetVarScalerFit.inverse_transform(y_test)
Test_Data = PredictorScalerFit.inverse_transform(X_test)

# Assemble one table: predictors, true targets, then a "Predicted <name>"
# column for every target
target_names = df.columns[:18]
PredNames = [f'Predicted {name}' for name in target_names]
TestingData = pd.DataFrame(data=Test_Data, columns=Predictors)
TestingData[target_names] = y_test_orig
TestingData[PredNames] = Predictions
TestingData.head()
Epoch 1/5
2/2 [==============================] - 1s 5ms/step - loss: 0.9964
Epoch 2/5
2/2 [==============================] - 0s 5ms/step - loss: 0.9958
Epoch 3/5
2/2 [==============================] - 0s 6ms/step - loss: 0.9952
Epoch 4/5
2/2 [==============================] - 0s 5ms/step - loss: 0.9944
Epoch 5/5
2/2 [==============================] - 0s 4ms/step - loss: 0.9937
6/6 [==============================] - 0s 1ms/step
Age Height Weight Siblings Education Gender Location Slow songs or fast songs Dance Folk ... Predicted Metal or Hardrock Predicted Punk Predicted Hiphop, Rap Predicted Reggae, Ska Predicted Swing, Jazz Predicted Rock n roll Predicted Alternative Predicted Latino Predicted Techno, Trance Predicted Opera
0 21.0 170.0 52.0 1.0 2.0 1.0 1.0 0.6 0.8 0.2 ... 0.472132 0.491922 0.585427 0.556206 0.560958 0.638533 0.573585 0.572241 0.469346 0.433011
1 22.0 182.0 71.0 1.0 2.0 0.0 0.0 0.8 0.8 0.4 ... 0.478891 0.497449 0.587976 0.558163 0.557445 0.640600 0.574664 0.563489 0.477443 0.432210
2 19.0 180.0 77.0 1.0 2.0 0.0 1.0 0.6 0.6 0.6 ... 0.478754 0.491955 0.590176 0.553323 0.558342 0.635931 0.576548 0.563108 0.476471 0.431431
3 18.0 170.0 50.0 1.0 2.0 1.0 1.0 0.6 0.8 0.4 ... 0.472186 0.492355 0.586317 0.555870 0.560950 0.638502 0.574939 0.571488 0.469721 0.432531
4 21.0 173.0 62.0 1.0 2.0 1.0 1.0 0.4 0.6 0.2 ... 0.472478 0.491221 0.586158 0.555400 0.560664 0.637893 0.574248 0.571415 0.470320 0.432865

5 rows × 43 columns

# Absolute percentage error (APE) of every prediction: one "<name> APE" column
# is added per target, and all of them are collected for the overall mean.
APEs = []
for col in df.columns[:18]:
    actual = TestingData[col]
    predicted = TestingData['Predicted ' + col]
    APE = (actual - predicted).abs() / actual * 100
    TestingData[col + ' APE'] = APE
    APEs.append(APE)

# Reported "accuracy" is 100 minus the mean APE over all targets and rows
print('The Accuracy of ANN model is:', 100-np.mean(APEs))
TestingData.head()
The Accuracy of ANN model is: 44.868009843516596
Age Height Weight Siblings Education Gender Location Slow songs or fast songs Dance Folk ... Metal or Hardrock APE Punk APE Hiphop, Rap APE Reggae, Ska APE Swing, Jazz APE Rock n roll APE Alternative APE Latino APE Techno, Trance APE Opera APE
0 21.0 170.0 52.0 1.0 2.0 1.0 1.0 0.6 0.8 0.2 ... 136.065999 145.961010 26.821649 178.102815 180.479074 59.633234 186.792725 42.775947 134.673157 8.252682
1 22.0 182.0 71.0 1.0 2.0 0.0 0.0 0.8 0.8 0.4 ... 40.138593 24.362245 2.003944 179.081643 30.319361 19.924967 4.222677 6.085243 20.426160 116.105118
2 19.0 180.0 77.0 1.0 2.0 0.0 1.0 0.6 0.6 0.6 ... 139.377096 22.988768 47.543982 7.779475 44.165754 5.988433 42.345172 6.148682 19.117625 46.071181
3 18.0 170.0 50.0 1.0 2.0 1.0 1.0 0.6 0.8 0.4 ... 136.092776 23.088799 46.579176 177.934939 180.474901 219.250852 187.469685 4.752046 134.860420 116.265455
4 21.0 173.0 62.0 1.0 2.0 1.0 1.0 0.4 0.6 0.2 ... 18.119456 145.610565 26.730193 38.849914 40.166074 20.263343 187.124127 42.853871 135.159978 116.432393

5 rows × 61 columns

# Write the trained model to "genrepredmodel" so it can be reloaded later.
model.save("genrepredmodel")
2023-06-06 05:04:07.807147: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'dropout_11_input' with dtype float and shape [?,7]
	 [[{{node dropout_11_input}}]]
2023-06-06 05:04:07.871773: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,7]
	 [[{{node inputs}}]]
2023-06-06 05:04:07.892504: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'dropout_11_input' with dtype float and shape [?,7]
	 [[{{node dropout_11_input}}]]
2023-06-06 05:04:07.930261: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,7]
	 [[{{node inputs}}]]
2023-06-06 05:04:07.964044: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,7]
	 [[{{node inputs}}]]
2023-06-06 05:04:08.031101: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,7]
	 [[{{node inputs}}]]
2023-06-06 05:04:08.089239: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,7]
	 [[{{node inputs}}]]
WARNING:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: genrepredmodel/assets
INFO:tensorflow:Assets written to: genrepredmodel/assets