In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import KFold
In [2]:
# Read the training table used for modelling from its CSV file.
train = pd.read_csv(filepath_or_buffer="inputs/train_from_f1.csv")
In [18]:
# Load only the regression target from the raw training file.
# `usecols` stops pandas from parsing (and briefly holding in memory) all the
# other columns, which were being discarded immediately after the read.
target_col = 'scalar_coupling_constant'
train_raw_y = pd.read_csv("inputs/train.csv", usecols=[target_col])[target_col]
In [19]:
# Sanity check: display the shape of the target Series.
# NOTE(review): its length is presumably expected to equal the row count of `train` — confirm.
train_raw_y.shape
Out[19]:
In [4]:
# Lift pandas' display truncation so every column and every row is rendered.
# A value of None means "no limit" for both options.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)
In [22]:
# Cast these three columns to pandas' `category` dtype, one column at a time
# so each assignment replaces the column (and its dtype) outright.
for cat_col in ('type', 'atom_0', 'atom_1'):
    train[cat_col] = train[cat_col].astype('category')
In [23]:
# Display the (rows, columns) shape of the training table.
train.shape
Out[23]:
In [ ]:
# 5-fold cross-validated LightGBM regression that collects out-of-fold (OOF)
# predictions: every row is predicted by the model of the fold that held it out.
#
# Reads:  `train` (feature table) and `train_raw_y` (target Series).
# NOTE(review): the two are assumed to be row-aligned (same length, same order) — confirm.
# Writes: `oof_preds` (one prediction per row of `train`); `model` holds the last fold's booster.

# Columns that are dropped before the data is handed to LightGBM.
drop_cols = ['id', 'molecule_name']

# The hyper-parameters do not change between folds, so build the dict once.
params = {'learning_rate': 0.01,
          'max_depth': 16,
          'boosting': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'is_training_metric': True,
          'num_leaves': 144,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.7,
          'bagging_freq': 5,
          'seed': 2019}

# NOTE(review): plain KFold splits by row, so rows sharing a `molecule_name` can
# fall on both sides of a split; consider GroupKFold if that leakage matters.
folds = KFold(n_splits=5, shuffle=True, random_state=511)

# Must exist before the loop; each fold fills in only its own validation rows.
oof_preds = np.zeros(len(train))

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    # KFold yields *positional* indices, so select by position with .iloc
    # (`.ix` was removed from pandas and mixed label/position semantics).
    train_X, train_y = train.iloc[trn_idx], train_raw_y.iloc[trn_idx]
    valid_X, valid_y = train.iloc[val_idx], train_raw_y.iloc[val_idx]

    train_ds = lgb.Dataset(train_X.drop(drop_cols, axis=1), label=train_y)
    valid_ds = lgb.Dataset(valid_X.drop(drop_cols, axis=1), label=valid_y)

    # Up to 1000 boosting rounds, logging every 10 and stopping once the
    # validation metric has not improved for 100 rounds.
    # NOTE(review): LightGBM >= 4.0 dropped `verbose_eval` / `early_stopping_rounds`
    # in favour of callbacks (lgb.log_evaluation / lgb.early_stopping) — switch
    # if the environment is upgraded.
    model = lgb.train(params, train_ds, 1000, valid_ds, verbose_eval=10, early_stopping_rounds=100)

    # sub_preds += model.predict(x_test) / folds.n_splits
    # Predict on exactly the feature columns the model was trained on.
    oof_preds[val_idx] = model.predict(valid_X.drop(drop_cols, axis=1))