from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(
random_state=0,
penalty="l2",
class_weight="balanced", # or dict {0: 0.1, 1: 0.9}
).fit(X, y,
# sample_weight=...,  # optional array of shape (n_samples,): one weight per row
)
clf.predict(X[:2, :])
clf.predict_proba(X[:2, :])
clf.score(X, y)
clf.decision_function(X)
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=4,
n_informative=2, n_redundant=0,
random_state=0, shuffle=False)
clf = RandomForestClassifier(
max_depth=2,
random_state=0,
n_estimators=100,
class_weight=None,  # or "balanced", "balanced_subsample", or a dict of per-class weights, e.g. {0: 0.1, 1: 0.9}
)
clf.fit(X, y,
# sample_weight=...,  # optional array of shape (n_samples,): one weight per row
)
print(clf.predict([[0, 0, 0, 0]]))
In [16]: pd.DataFrame(X).corr()
Out[16]:
0 1 2 3
0 1.000000 0.065124 0.026765 0.028988
1 0.065124 1.000000 0.031176 -0.026317
2 0.026765 0.031176 1.000000 -0.006788
3 0.028988 -0.026317 -0.006788 1.000000
In [17]: clf.feature_importances_
Out[17]: array([0.14205973, 0.76664038, 0.0282433 , 0.06305659])
print(clf.predict_log_proba([[0, 0, 0, 0]]))
print(clf.predict_proba([[0, 0, 0, 0]]))
print(clf.predict([[0, 0, 0, 0]]))
In [18]: print(clf.predict_log_proba([[0, 0, 0, 0]]))
...:
[[-1.72562562 -0.19608985]]
In [19]: print(clf.predict_proba([[0, 0, 0, 0]]))
...:
[[0.17806162 0.82193838]]
In [20]: from math import log
In [21]: log(0.82193838)
Out[21]: -0.19608985023951067
In [22]: print(clf.predict([[0, 0, 0, 0]]))
[1]
y_true = y
y_pred = clf.predict(X)
metrics.accuracy_score(y_true, y_pred)
Out[29]: 0.925
metrics.confusion_matrix(y_true, y_pred)
Out[30]:
array([[434, 70],
[ 5, 491]])
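# NOTE: y_pred holds hard 0/1 labels, so this ROC curve has only one operating point;
# pass clf.predict_proba(X)[:, 1] as the score to get the full curve and a meaningful AUC.
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)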
metrics.auc(fpr, tpr)
# Out[32]: 0.9255152329749103
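# NOTE: log_loss expects predicted probabilities; with hard labels every mistake is a
# clipped log(0), which is where the large value below comes from.
# Prefer metrics.log_loss(y_true, clf.predict_proba(X)).
metrics.log_loss(y_true, y_pred,)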
Out[33]: 2.590464201438415
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Cross Validation
>>> import numpy as np
>>> from sklearn.model_selection import (
...     KFold,
...     StratifiedKFold,  # preserves the percentage of samples per class.
... )
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
>>> y = np.array([1, 2, 3, 4])
>>> kf = KFold(n_splits=2)
>>> kf.get_n_splits(X)
2
>>> print(kf)
KFold(n_splits=2, random_state=None, shuffle=False)
>>> for train_index, test_index in kf.split(X):
...     print("TRAIN:", train_index, "TEST:", test_index)
...     X_train, X_test = X[train_index], X[test_index]
...     y_train, y_test = y[train_index], y[test_index]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]
from sklearn import utils
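# Both helpers require arguments; "balanced" weights classes inversely to their frequency.
utils.class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)  # one weight per class
utils.class_weight.compute_sample_weight(class_weight="balanced", y=y)  # one weight per sample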
# Other handy references
https://ml-cheatsheet.readthedocs.io/