Molecular Properties: A Journey through Multiple Linear Regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx
import seaborn as sns
from sklearn import preprocessing, tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import lightgbm
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
import keras
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('molecule_complete.csv')

df.columns  # the names of all the columns
Index(['molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'potential_energy', 'X', 'Y', 'Z',
       'XX_atom1', 'YX_atom1', 'ZX_atom1', 'XY_atom1', 'YY_atom1', 'ZY_atom1',
       'XZ_atom1', 'YZ_atom1', 'ZZ_atom1', 'XX', 'YX', 'ZX', 'XY', 'YY', 'ZY',
       'XZ', 'YZ', 'ZZ', 'mulliken_charge_atom1', 'mulliken_charge',
       'type_scc', 'fc', 'sd', 'pso', 'dso', 'atom_atom1_structure',
       'x_atom1_structure', 'y_atom1_structure', 'z_atom1_structure', 'atom',
       'x', 'y', 'z'],
      dtype='object')

df.select_dtypes(include=[object])  # this shows the categorical variables
The categorical variables among the features
fig, ax = plt.subplots(figsize=(20, 12))
for i, t in enumerate(df['type'].unique()):
    df_type = df.loc[df['type'] == t]
    G = nx.from_pandas_edgelist(df_type, 'atom_index_0', 'atom_index_1', ['scalar_coupling_constant'])
    plt.subplot(2, 4, i + 1)
    nx.draw(G, with_labels=True)
    plt.title(f'Graph for type {t}')
This is just for the scalar coupling constant; the rest of the graphs can be found in the final notebook linked above.
This shows how much weight each atom type has on different atoms.
While this one shows how many times each type shows up altogether.
# This was done for all of the important features from the LGBM feature-importance model, but I will only show the scalar coupling constant here.
fig, ax = plt.subplots(figsize = (18, 6))
plt.subplot(1, 2, 1);
plt.hist(df['scalar_coupling_constant'], bins=20);
plt.title('Basic scalar_coupling_constant histogram');
plt.subplot(1, 2, 2);
sns.violinplot(x='type', y='scalar_coupling_constant', data=df);
plt.title('Violinplot of scalar_coupling_constant by type');
The histogram shows that most scalar coupling constants are rather small. The violin plot, however, gives another great visual of how each type plays a critical role in the target variable.
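To back the histogram with numbers, a quick summary of the target's distribution helps (a minimal sketch; the cutoff of 10 below is an illustrative threshold, not from the original analysis):

# Summary statistics for the target variable
print(df['scalar_coupling_constant'].describe())

# Fraction of couplings with a small magnitude (threshold of 10 is illustrative)
small_share = (df['scalar_coupling_constant'].abs() < 10).mean()
print(f'{small_share:.1%} of coupling constants have an absolute value below 10')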
for f in ['type', 'type_scc', 'atom_atom1_structure', 'atom']:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df[f].values))
    df[f] = lbl.transform(list(df[f].values))

train = df.drop(['molecule_name', 'scalar_coupling_constant'], axis=1)
test = df['scalar_coupling_constant']
feature_train, feature_test, target_train, target_test = train_test_split(train, test, test_size=0.12)
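The split sizes below were printed with something like the following (a minimal sketch; the original post shows only the output):

print('total feature training features: ', len(feature_train))
print('total feature testing features: ', len(feature_test))
print('total target training features: ', len(target_train))
print('total target testing features: ', len(target_test))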
total feature training features:  4099169
total feature testing features:  558978
total target training features:  4099169
total target testing features:  558978
train_data = lightgbm.Dataset(feature_train, label=target_train)
test_data = lightgbm.Dataset(feature_test, label=target_test)

# Create some parameters:

n_fold = 7
folds = KFold(n_splits=n_fold, shuffle=True, random_state=4)

parameters = {'num_leaves': 250,
              'min_child_samples': 75,
              'objective': 'regression',
              'max_depth': 24,
              'learning_rate': 0.001,
              'boosting_type': 'gbdt',
              'subsample_freq': 3,
              'subsample': 0.9,
              'bagging_seed': 15,
              'metric': 'mae',
              'verbosity': -1,
              'reg_alpha': 0.01,
              'reg_lambda': 0.3,
              'colsample_bytree': 1.0}

# Now to run the model:

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)  # I should have used 10,000, but 5,000 was still great

# Time to visualize the new feature importance change:
ax = lightgbm.plot_importance(model, max_num_features=40, figsize=(15,15))
plt.show()
We see virtually no change compared to the previous LGBM model I made.
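For a numeric view of the same ranking, the importances can also be pulled straight off the trained booster (a small sketch using LightGBM's feature_importance API):

# Top features by split count, as a dataframe instead of a plot
importance = pd.DataFrame({'feature': model.feature_name(),
                           'importance': model.feature_importance(importance_type='split')})
print(importance.sort_values('importance', ascending=False).head(10))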
normal_feature = df.drop(['molecule_name', 'scalar_coupling_constant'], axis=1)
target = df['scalar_coupling_constant']

x = normal_feature.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)  # this gives an array instead of a dataframe, so convert it back and restore the original column names
nf = pd.DataFrame(x_scaled)

# This reassigns the names. (Passing columns=normal_feature.columns to pd.DataFrame would do it in one step, but I had already written the mapping out once.)
normfeat = nf.rename(columns={0: 'atom_index_0', 1: 'atom_index_1', 2: 'type', 3: 'potential_energy',
                              4: 'X', 5: 'Y', 6: 'Z',
                              7: 'XX_atom1', 8: 'YX_atom1', 9: 'ZX_atom1', 10: 'XY_atom1', 11: 'YY_atom1',
                              12: 'ZY_atom1', 13: 'XZ_atom1', 14: 'YZ_atom1', 15: 'ZZ_atom1',
                              16: 'XX', 17: 'YX', 18: 'ZX', 19: 'XY', 20: 'YY', 21: 'ZY', 22: 'XZ', 23: 'YZ', 24: 'ZZ',
                              25: 'mulliken_charge_atom1', 26: 'mulliken_charge', 27: 'type_scc',
                              28: 'fc', 29: 'sd', 30: 'pso', 31: 'dso', 32: 'atom_atom1_structure',
                              33: 'x_atom1_structure', 34: 'y_atom1_structure', 35: 'z_atom1_structure',
                              36: 'atom', 37: 'x', 38: 'y', 39: 'z'})

features = normfeat
target = df[['scalar_coupling_constant']]
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.1)

# Then to convert it all to numpy arrays for easier calculations:

feature_array = feature_train.to_numpy()
target_array = target_train.to_numpy()

linear = LinearRegression()
mse = cross_val_score(linear, feature_array, target_array, scoring='neg_mean_squared_error', cv=20)
mean_mse = np.mean(mse)  # -6.683008610663098e-09
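Because the scorer returns negative MSE, the magnitude is easier to read after flipping the sign and taking a square root (a quick sketch):

# Convert neg_mean_squared_error back to an RMSE for readability
rmse = np.sqrt(-mean_mse)
print('Cross-validated RMSE: ', rmse)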
target_test = target_test.reset_index()
target_test = target_test.drop(['index'], axis=1)
# Only run the above code once or it will give you an error
target_test = target_test.rename(columns={'scalar_coupling_constant': 'Actual Test Scalar'})
ridge = Ridge()
parameters = {'alpha': [1e-25, 1e-20, 1e-15, 1e-10, 1e-05, 0.01, 1, 10]}  # the alpha grid; don't reuse the earlier LGBM dict here
ridgereg = GridSearchCV(ridge, param_grid=parameters, scoring='neg_mean_squared_error', cv=15)
ridgereg.fit(feature_array, target_array)
print('ridge param: ', ridgereg.best_params_)
print('ridge score: ', ridgereg.best_score_)
ridgereg

# The answers:
ridge param:  {'alpha': 1e-25}
ridge score:  -6.683015092064162e-09

GridSearchCV(cv=15, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-25, 1e-20, 1e-15, 1e-10, 1e-05, 0.01, 1,
                                   10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)
# Finally, to fit the best ridge parameters to our training set:
ridgereg = GridSearchCV(ridge, param_grid=parameters, scoring='neg_mean_squared_error', cv=20)
ridgereg.fit(feature_array, target_array)

# To check the actual score:
ridgereg.score(feature_test, target_test)
-6.836977895342504e-09  # this is a very small mean squared error
ridgepredict = ridgereg.predict(feature_test)
actualtest = np.array(target_test)

plt.rcParams["figure.figsize"] = (8, 8) 
fig, ax = plt.subplots() 
ax.scatter(actualtest, ridgepredict) 
ax.set(title="Ridge Actual vs Predict") 
ax.set(xlabel="Actual", ylabel="Predict");
This shows how well my model predicts against the actual test values.
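The fit can also be quantified rather than eyeballed (a minimal sketch using sklearn.metrics on the same arrays as the plot):

from sklearn.metrics import mean_absolute_error, r2_score

print('Ridge MAE: ', mean_absolute_error(actualtest, ridgepredict))
print('Ridge R^2: ', r2_score(actualtest, ridgepredict))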
# To save the ridge data into a dataframe
ridgedata = pd.concat([target_test, pd.DataFrame(ridgepredict)], axis=1)
ridgecomparison = ridgedata.rename(columns={0: 'Predicted Ridge Scalar'})
lasso = Lasso()
lassoreg = GridSearchCV(lasso, param_grid=parameters, scoring='neg_mean_squared_error', cv=20)
lassoreg.fit(feature_array, target_array)

lassoreg.score(feature_test, target_test)
-1.3148967532843286e-05  # the score is worse than Ridge's, which shows that Ridge suits this problem better than Lasso

# To create the Lasso graph:
lassopredict=lassoreg.predict(feature_test)
plt.rcParams["figure.figsize"] = (8, 8)
fig, ax = plt.subplots()
ax.scatter(actualtest, lassopredict)
ax.set(title="Lasso Actual vs Predict")
ax.set(xlabel="Actual", ylabel="Predict");
The same issue persists.
lassodata = pd.concat([target_test, pd.DataFrame(lassopredict)], axis=1)
lassocomparison = lassodata.rename(columns={0: 'Predicted Lasso Scalar'})

# Combining the test dataframe with the Ridge and Lasso results, then saving it as a CSV to check with Tableau
accuracy_check = pd.concat([ridgecomparison, lassocomparison], axis=1)
accuracy_check = accuracy_check.loc[:, ~accuracy_check.columns.duplicated()]  # this drops any duplicated columns
accuracy_check.to_csv('accuracy_check.csv', index=False)

# You can run the following to see if it saved properly:
acc_chk= pd.read_csv('accuracy_check.csv')
acc_chk.head()
The Tableau visual (sorry, there is no code for this)
earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0,
                                          mode='auto', baseline=None, restore_best_weights=False)
terminate = keras.callbacks.TerminateOnNaN()
adadelta = optimizers.Adadelta(learning_rate=1.0, rho=0.95)
m4 = Sequential()
m4.add(Dense(64, input_dim=40, activation='relu'))
m4.add(Dense(32, activation='relu'))
m4.add(Dense(16, activation='relu'))
m4.add(Dense(8, activation='relu'))
m4.add(Dense(4, activation='relu'))
m4.add(Dense(1, activation='linear'))
m4.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_19 (Dense)             (None, 64)                2624
_________________________________________________________________
dense_20 (Dense)             (None, 32)                2080
_________________________________________________________________
dense_21 (Dense)             (None, 16)                528
_________________________________________________________________
dense_22 (Dense)             (None, 8)                 136
_________________________________________________________________
dense_23 (Dense)             (None, 4)                 36
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 5
=================================================================
Total params: 5,409
Trainable params: 5,409
Non-trainable params: 0

# early callback
m4.compile(optimizer=adadelta, loss='mean_squared_error', metrics=['mae', 'acc'])
his4 = m4.fit(feature_array, target_array, validation_split=0.3, verbose=1, callbacks=[earlystop, terminate],
              epochs=300, batch_size=5000)
# with early and termination callback established
acc = his4.history['acc']
val_acc = his4.history['val_acc']
loss = his4.history['loss']
val_loss = his4.history['val_loss']
epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()
plt.show()
The loss converges, but the accuracy does not, and it is far too low to be meaningful. That is expected: accuracy is a classification metric, so it says little about a continuous regression target.
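If accuracy were dropped entirely, the compile step would track only MAE, the appropriate metric for a continuous target (a one-line sketch of the alternative; the run below keeps 'acc' for comparison):

# Regression-appropriate metrics: MAE only, no classification accuracy
m4.compile(optimizer=adadelta, loss='mean_squared_error', metrics=['mae'])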
m4.compile(optimizer=adadelta, loss='mean_squared_error', metrics=['mae', 'acc'])
his4 = m4.fit(feature_array, target_array, validation_split=0.3, verbose=1,
              epochs=300, batch_size=5000)

# The graph
acc = his4.history['acc']
val_acc = his4.history['val_acc']
loss = his4.history['loss']
val_loss = his4.history['val_loss']
epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()
plt.show()
With no early callback
mae = his4.history['mae']
val_mae = his4.history['val_mae']
loss = his4.history['loss']
val_loss = his4.history['val_loss']
epochs = range(len(mae))

plt.plot(epochs, mae, 'r', label='Training mae')
plt.plot(epochs, val_mae, 'b', label='Validation mae')
plt.title('Training and validation mae')
plt.ylabel('mae')
plt.xlabel('epoch')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()
plt.show()
Mean Absolute Error
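To close the loop, the network can be scored on the same held-out split used for Ridge and Lasso (a minimal sketch; it assumes feature_test and actualtest from earlier are still in scope):

# Returns [MSE loss, MAE, accuracy] for the metrics compiled above
results = m4.evaluate(feature_test.to_numpy(), actualtest, verbose=0)
print('Test loss (MSE), MAE, accuracy: ', results)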
