9. scikit-learn

9.1. Loading Sample Datasets

Code Listing 9.21. Loading Sample Datasets
from sklearn import datasets
from sklearn.model_selection import train_test_split


dataset = datasets.load_iris()
# dataset = datasets.load_breast_cancer()
# dataset = datasets.load_diabetes()
# dataset = datasets.load_boston()  # removed in scikit-learn 1.2
# dataset = datasets.load_wine()

features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)
features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]


# The documentation most often uses this form:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=0)

9.2. Fit and Predict

Code Listing 9.22. Fit and Predict
from sklearn.tree import DecisionTreeClassifier


features = [
    (5.1, 3.5, 1.4, 0.2),  # setosa
    (7.0, 3.2, 4.7, 1.4),  # versicolor
    (6.3, 3.3, 6.0, 2.5),  # virginica
    (4.9, 3.0, 1.4, 0.2),  # setosa
    (4.7, 3.2, 1.3, 0.2),  # setosa
    (6.4, 3.2, 4.5, 1.5),  # versicolor
    (7.1, 3.0, 5.9, 2.1),  # virginica
    (6.9, 3.1, 4.9, 1.5),  # versicolor
    (5.8, 2.7, 5.1, 1.9),  # virginica
]

labels = [
    'setosa',
    'versicolor',
    'virginica',
    'setosa',
    'setosa',
    'versicolor',
    'virginica',
    'versicolor',
    'virginica'
]


model = DecisionTreeClassifier()
model.fit(features, labels)

to_predict = [
    (5.6, 2.3, 4.1, 2.9)
]

output = model.predict(to_predict)
print(output)
# ['virginica']

9.3. Classifier

Code Listing 9.23. Classifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, datasets


dataset = datasets.load_iris()
features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]


model = KNeighborsClassifier(n_neighbors=5)
model.fit(features_train, labels_train)
labels_predicted = model.predict(features_test)

accuracy = metrics.accuracy_score(labels_test, labels_predicted)
print(accuracy)
# 0.9736842105263158

9.4. Feature Selection

VarianceThreshold removes all features whose variance does not exceed a given threshold:

from sklearn.feature_selection import VarianceThreshold

features = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1]
]

# Remove boolean features that have the same value in more than 80% of the samples,
# i.e. features whose variance is below p*(1-p) = 0.8*(1-0.8) = 0.16
sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))

sel.fit_transform(features)
# array([[0, 1],
#        [1, 0],
#        [0, 0],
#        [1, 1],
#        [1, 0],
#        [1, 1]])

SelectKBest keeps the k features that score highest against the labels, here using the chi-squared (chi2) test:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

iris = load_iris()
features = iris.data
labels = iris.target

features.shape
# (150, 4)

best_features = SelectKBest(chi2, k=2).fit_transform(features, labels)
# array([[1.4, 0.2],
#        [1.4, 0.2],
#        ...
#        [5.4, 2.3],
#        [5.1, 1.8]])

best_features.shape
# (150, 2)

9.5. Evaluation

9.5.1. Score

Code Listing 9.24. Score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split


dataset = datasets.load_iris()
features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]

model = KNeighborsClassifier()
model.fit(features_train, labels_train)
model.predict(features_test)

score = model.score(features_test, labels_test)
accuracy = score * 100  # in percent

print(f'Accuracy: {accuracy:.2f}%')
# Accuracy: 97.37%

9.5.2. Cross Validation

Code Listing 9.25. Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split


dataset = datasets.load_iris()
features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]

model = KNeighborsClassifier()
scores = cross_val_score(model, features_train, labels_train, cv=5)
accuracy = scores.mean() * 100  # percent
stdev = scores.std() * 100      # percent

print(f'Accuracy: {accuracy:.2f}% (+/- {stdev:.2f}%)')
# Accuracy: 95.49% (+/- 4.98%)

9.6. Label Encoder

Code Listing 9.26. Label Encoder
from sklearn import preprocessing


features = [
    (5.1, 3.5, 1.4, 0.2),  # setosa
    (7.0, 3.2, 4.7, 1.4),  # versicolor
    (6.3, 3.3, 6.0, 2.5),  # virginica
    (4.9, 3.0, 1.4, 0.2),  # setosa
    (4.7, 3.2, 1.3, 0.2),  # setosa
    (6.4, 3.2, 4.5, 1.5),  # versicolor
    (7.1, 3.0, 5.9, 2.1),  # virginica
    (6.9, 3.1, 4.9, 1.5),  # versicolor
    (5.8, 2.7, 5.1, 1.9),  # virginica
]

labels_names = [
    'setosa',
    'versicolor',
    'virginica',
    'setosa',
    'setosa',
    'versicolor',
    'virginica',
    'versicolor',
    'virginica'
]

label_encoder = preprocessing.LabelEncoder()
labels = label_encoder.fit_transform(labels_names)
# array([0, 1, 2, 0, 0, 1, 2, 1, 2])

list(label_encoder.classes_)
# ['setosa', 'versicolor', 'virginica']

# 0: setosa
# 1: versicolor
# 2: virginica

list(label_encoder.inverse_transform([2, 2, 1]))
# ['virginica', 'virginica', 'versicolor']

9.7. Writing Own Classifier

9.7.1. Random Classifier

import random


class RandomNeighborClassifier:
    """Baseline classifier that predicts a randomly chosen training label."""

    def fit(self, features, labels):
        # No learning happens here - just memorize the training data
        self.features_train = features
        self.labels_train = labels

    def predict(self, features_test):
        predictions = []

        for row in features_test:
            # Ignore the features and pick a random label seen in training
            label = random.choice(self.labels_train)
            predictions.append(label)

        return predictions

Accuracy for the Iris dataset: 0.346666666667
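
A minimal sketch of how this number can be obtained, assuming the RandomNeighborClassifier class defined above (the exact value varies between runs, since the predictions are random; about 1/3 is expected for three balanced classes):

from sklearn.model_selection import train_test_split
from sklearn import metrics, datasets


dataset = datasets.load_iris()
data = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]

model = RandomNeighborClassifier()
model.fit(features_train, labels_train)
predictions = model.predict(features_test)

accuracy = metrics.accuracy_score(labels_test, predictions)
print(accuracy)
# e.g. 0.346666666667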

9.8. Practical Exercises

9.8.1. Nearest Neighbor Classifier

  1. Write a nearest neighbor classifier

  2. Split the data half-and-half into training and test sets

  3. For the Iris dataset it should achieve an accuracy above 90%

  4. The NearestNeighborClassifier class should expose an interface compatible with scikit-learn:

    • .fit() - for training the model
    • .predict() - for making predictions
  5. For comparison use accuracy = metrics.accuracy_score(labels_test, labels_predicted)

Hints:
  • For each test sample, check its distance to every training sample

  • Pick the training sample with the smallest distance and predict its label

  • Use the Euclidean distance to measure distances (see the short example after the template below).

  • from scipy.spatial.distance import euclidean as euclidean_distance

  • from sklearn import metrics
    from scipy.spatial.distance import euclidean as euclidean_distance
    from sklearn.model_selection import train_test_split
    from sklearn import datasets
    
    
    class NearestNeighborClassifier:
        def fit(self, features, labels):
            raise NotImplementedError
    
        def predict(self, features_test):
            raise NotImplementedError
    
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target
    
    data = train_test_split(features, labels, test_size=0.5, random_state=0)
    
    features_train = data[0]
    features_test = data[1]
    labels_train = data[2]
    labels_test = data[3]
    
    model = NearestNeighborClassifier()
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)
    accuracy = metrics.accuracy_score(labels_test, predictions)
    
    print(accuracy)
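
The scipy helper computes the straight-line distance between two points, for example:

from scipy.spatial.distance import euclidean as euclidean_distance

euclidean_distance([0, 0], [3, 4])
# 5.0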
    
About:
  • Filename: ml-sklearn-classifier.py
  • Lines of code to write: 15 lines
  • Estimated time of completion: 30 min

9.8.2. Classifier Comparison

  • Load the Breast Cancer Dataset (datasets.load_breast_cancer())
  • Split the dataset into test data (15%) and training data (85%) and set random_state=0
  • Analyze the data using several different models
  • Display each model's name, accuracy and standard deviation, for example:
Nearest Neighbors | Accuracy: 71.18% (+/- 3.78%)
       Linear SVM | Accuracy: 76.04% (+/- 2.79%)
          RBF SVM | Accuracy: 64.24% (+/- 0.22%)
 Gaussian Process | Accuracy: 68.58% (+/- 3.07%)
    Decision Tree | Accuracy: 68.24% (+/- 4.53%)
    Random Forest | Accuracy: 73.96% (+/- 3.28%)
       Neural Net | Accuracy: 65.28% (+/- 2.75%)
         AdaBoost | Accuracy: 72.57% (+/- 4.16%)
      Naive Bayes | Accuracy: 73.62% (+/- 2.89%)
              QDA | Accuracy: 73.97% (+/- 4.42%)
Hints:
classifiers = [
    {'name': "Nearest Neighbors", 'model': KNeighborsClassifier()},
    {'name': "Linear SVM",        'model': SVC(kernel="linear")},
    {'name': "RBF SVM",           'model': SVC(kernel="rbf")},
    {'name': "Gaussian Process",  'model': GaussianProcessClassifier()},
    {'name': "Decision Tree",     'model': DecisionTreeClassifier()},
    {'name': "Random Forest",     'model': RandomForestClassifier()},
    {'name': "Neural Net",        'model': MLPClassifier(max_iter=1500)},
    {'name': "AdaBoost",          'model': AdaBoostClassifier()},
    {'name': "Naive Bayes",       'model': GaussianNB()},
    {'name': "QDA",               'model': QuadraticDiscriminantAnalysis()},
]
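
One possible evaluation loop, sketched under the assumption that the classifiers list above is defined (with the corresponding model imports) and following the cross-validation pattern from Section 9.5.2:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import datasets


dataset = datasets.load_breast_cancer()
data = train_test_split(dataset.data, dataset.target, test_size=0.15, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]

for classifier in classifiers:
    scores = cross_val_score(classifier['model'], features_train, labels_train, cv=5)
    accuracy = scores.mean() * 100   # percent
    stdev = scores.std() * 100       # percent
    print(f'{classifier["name"]:>17} | Accuracy: {accuracy:.2f}% (+/- {stdev:.2f}%)')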
Bonus task:

  • Parallelize running the predictions using the threading module and a Worker-based architecture.
  • Display the list sorted in descending order by accuracy
About:
  • Filename: ml-sklearn-comparision.py
  • Lines of code to write: 15 lines
  • Estimated time of completion: 20 min