Baseline/Ramp


Saturday February 13th 2016, La Paillasse: Rapid Analytics and Model Prototyping (RAMP).

[Image: 40 data scientists competing for the best model for cancer mortality]

[back: http://wiki.epidemium.cc/wiki/Baseline]

A big thank you to Djalel Benbouzid from UPMC for organising such a wonderful event. A giant boost against cancer!

Starting-Kit

The starting kit is available here.

Forty data scientists participated in this one-day modelling competition.

Model #1

Bravo to mohamed_zenadi for his sub_two model! Using variables on AIDS (HIV prevalence), income and alcohol consumption, and a linear combination of three models, he obtained a much better prediction of cancer mortality than the other competitors. Sum of squared errors: 333.

Of note, a simple model such as Y = a + b·gender + c·year leads to an error of about 2500, and adding variables and using a GLM brings it down to about 800. So 333 is very impressive.
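To make that baseline concrete, here is a minimal sketch of such a naive model, assuming a pandas DataFrame with the Year and Gender columns used in the feature extractors below; the target column name ('mortality') is an assumption, and the actual error values depend on the real Epidemium data and scoring.

 import pandas as pd
 from sklearn.linear_model import LinearRegression
 
 def naive_baseline_sse(df, target='mortality'):
     # Y = a + b*gender + c*year: dummify Gender, keep Year as-is
     # (the target column name is an assumption, not from the starting kit)
     X = pd.get_dummies(df[['Year', 'Gender']], columns=['Gender'])
     y = df[target]
     model = LinearRegression().fit(X, y)
     # in-sample sum of squared errors, comparable in spirit to the scores above
     return ((model.predict(X) - y) ** 2).sum()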

Preparation: selecting and shaping the data

 import pandas as pd
 from numpy import NaN
 
 class FeatureExtractor(object):
     core_cols = ['Year']
     region_cols = ['RegionType', 'Part of', 'Region']
     categ_cols = ['Gender', 'MainOrigin', 'cancer_type'] + region_cols
     additional_cols = ['HIV_15_49', 'income', 'alcool_consumption']
 
     def __init__(self):
         # dummy columns seen on the training set, filled by the first transform
         self.more_cols = set()
 
     def fit(self, X_df, y_array):
         pass
 
     def transform(self, X_df):
         ret = X_df[self.core_cols].copy()
         # dummify the categorical variables
         train_data = True
         if self.more_cols:
             # columns were already recorded, so this call is on the test set
             train_data = False
         extra_cols = set()
         for col in self.categ_cols:
             c = pd.get_dummies(X_df[col], prefix=col[:3])
             if train_data:
                 self.more_cols.update(set(c.columns.values))
             else:
                 extra_cols.update(set(c.columns.values))
             ret = ret.join(c)
         if not train_data:
             # align the test dummies with the training columns:
             # add the missing ones as NaN, drop the unseen ones
             for c in self.more_cols:
                 if c not in extra_cols:
                     ret[c] = NaN
             for c in extra_cols:
                 if c not in self.more_cols:
                     ret = ret.drop(c, axis=1)
         print(ret.shape)  # debug: shape of the feature matrix
         # add extra information
         for col in self.additional_cols:
             ret[col] = X_df[col]
         return ret.values
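A rough usage sketch (X_train_df, X_test_df and y_train are placeholders for the RAMP data splits): the first transform records the dummy columns seen in training, and the second pads or drops test columns so both matrices end up with the same width.

 fe = FeatureExtractor()
 fe.fit(X_train_df, y_train)          # no-op, kept for the RAMP interface
 X_train = fe.transform(X_train_df)   # records the training dummy columns
 X_test = fe.transform(X_test_df)     # aligned with the training columns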

Regression: calibrating the model

 import pandas as pd
 from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
 from sklearn.preprocessing import Imputer, MaxAbsScaler
 from sklearn import linear_model
 from sklearn import svm
 from sklearn.base import BaseEstimator
 from sklearn.pipeline import make_pipeline
 
 class Regressor(BaseEstimator):
     def __init__(self):
         # three base regressors, each preceded by a missing-value imputer
         self.clf1 = make_pipeline(
             Imputer(),
             GradientBoostingRegressor(n_estimators=5000, max_depth=8))
         self.clf2 = make_pipeline(
             Imputer(),
             MaxAbsScaler(),
             ExtraTreesRegressor(n_estimators=5000, criterion='mse', max_depth=8,
                                 min_samples_split=10, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,
                                 max_features='auto', max_leaf_nodes=None,
                                 bootstrap=False, oob_score=False,
                                 n_jobs=1, random_state=42, verbose=0,
                                 warm_start=True))
         self.clf3 = make_pipeline(Imputer(), svm.LinearSVR())
         # linear regression that blends the three base predictions
         self.clf = linear_model.LinearRegression()
 
     def fit(self, X_t, y_t):
         self.clf1.fit(X_t, y_t)
         self.clf2.fit(X_t, y_t)
         self.clf3.fit(X_t, y_t)
         # in-sample predictions of the base models become the blender's features
         y1 = self.clf1.predict(X_t)
         y2 = self.clf2.predict(X_t)
         y3 = self.clf3.predict(X_t)
         d = pd.DataFrame({'y1': y1, 'y2': y2, 'y3': y3}).values
         self.clf.fit(d, y_t)
 
     def predict(self, X_cv):
         r1 = self.clf1.predict(X_cv)
         r2 = self.clf2.predict(X_cv)
         r3 = self.clf3.predict(X_cv)
         r = pd.DataFrame({'y1': r1, 'y2': r2, 'y3': r3}).values
         return self.clf.predict(r)
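Putting the two pieces together, a plausible end-to-end evaluation might look like the sketch below; the train/test split, the variable names X_df and y, and the sum-of-squared-errors scoring are assumptions about how the RAMP harness evaluated submissions, not the official setup.

 from sklearn.model_selection import train_test_split
 
 # X_df is the raw Epidemium DataFrame and y the mortality target (placeholders)
 X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, random_state=0)
 
 fe = FeatureExtractor()
 reg = Regressor()
 reg.fit(fe.transform(X_train_df), y_train)
 y_pred = reg.predict(fe.transform(X_test_df))
 print(((y_pred - y_test) ** 2).sum())  # comparable in spirit to the scores quoted above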

Model #2

Bravo to alexander_mikheev for his Alex4 model! Finishing in the top 5, he used many more explanatory (X) variables:

(Sum of squared errors: 760; many models scored around 780-820, then gradually degraded to more than 2000.)

Preparation

 import pandas as pd
 
 class FeatureExtractor(object):
     # numerical covariates used directly
     core_cols = ['Year', 'income', 'fastfood_spending', 'alcool_consumption',
                  'nickel_emission', 'chromium_emission']
     core_cols += ['measles_vacc_1', 'polio_vacc', 'tetanus_vacc', 'diphteria_vacc']
     core_cols += ['hepb_vacc', 'shale_oil', 'transplants_prevalence',
                   'cadmium_export', 'companies_indus']
     region_cols = ['RegionType', 'Part of', 'Region']
     categ_cols = ['Gender', 'Age', 'MainOrigin', 'cancer_type']
     additional_cols = ['HIV_15_49']
 
     def __init__(self):
         pass
 
     def fit(self, X_df, y_array):
         pass
 
     def transform(self, X_df):
         ret = X_df[self.core_cols].copy()
         # dummify the categorical variables
         for col in self.categ_cols:
             ret = ret.join(pd.get_dummies(X_df[col], prefix=col[:3]))
         # add extra information
         for col in self.additional_cols:
             ret[col] = X_df[col]
         # ret = ret.dropna()
         return ret.values
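One design difference worth noting, sketched below: unlike Model #1's extractor, this one does not align the dummy columns between calls, so if a categorical level appears in only one of two DataFrames the resulting matrices have different widths. The toy data here is purely illustrative, not from the Epidemium dataset.

 import pandas as pd
 
 train_df = pd.DataFrame({'Gender': ['M', 'F']})
 test_df = pd.DataFrame({'Gender': ['M']})
 print(pd.get_dummies(train_df['Gender']).shape)  # (2, 2)
 print(pd.get_dummies(test_df['Gender']).shape)   # (1, 1) -- one column fewer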

Regression

 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.preprocessing import Imputer, MinMaxScaler
 from sklearn.base import BaseEstimator
 from sklearn.pipeline import make_pipeline
 
 class Regressor(BaseEstimator):
     def __init__(self):
         # impute missing values, rescale, then fit a gradient-boosted ensemble
         self.clf = make_pipeline(
             Imputer(),
             MinMaxScaler(),
             GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                       learning_rate=0.7))
 
     def fit(self, X, y):
         return self.clf.fit(X, y)
 
     def predict(self, X):
         return self.clf.predict(X)
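As a rough usage sketch, Model #2's pipeline can be scored with cross-validation; X_df and y are placeholders for the Epidemium features and mortality target, and the 5-fold sum of squared errors below is only meant to illustrate the evaluation, not to reproduce the leaderboard number.

 from sklearn.model_selection import cross_val_predict
 
 fe = FeatureExtractor()
 reg = Regressor()
 y_cv = cross_val_predict(reg, fe.transform(X_df), y, cv=5)
 print(((y_cv - y) ** 2).sum())  # cross-validated sum of squared errors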