3.1. scikit-learn

3.1.1. Loading Sample Datasets

Code 3.102. Loading Sample Datasets
# doctest: +SKIP_FILE

from sklearn import datasets
from sklearn.model_selection import train_test_split


dataset = datasets.load_iris()
# dataset = datasets.load_breast_cancer()
# dataset = datasets.load_diabetes()
# dataset = datasets.load_boston()  # removed in scikit-learn 1.2
# dataset = datasets.load_wine()

features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)
features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]


# The same split in its most common form, using tuple unpacking
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=0)
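
The loaders return a Bunch object which, besides .data and .target, also carries useful metadata. A quick inspection sketch for the Iris dataset:

dataset.data.shape          # (150, 4) - 150 samples, 4 features each
dataset.feature_names       # ['sepal length (cm)', 'sepal width (cm)',
                            #  'petal length (cm)', 'petal width (cm)']
list(dataset.target_names)  # ['setosa', 'versicolor', 'virginica']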

3.1.2. Fit and Predict

Code 3.103. Fit and Predict
# doctest: +SKIP_FILE

from sklearn.tree import DecisionTreeClassifier


features = [
    (5.1, 3.5, 1.4, 0.2),  # setosa
    (7.0, 3.2, 4.7, 1.4),  # versicolor
    (6.3, 3.3, 6.0, 2.5),  # virginica
    (4.9, 3.0, 1.4, 0.2),  # setosa
    (4.7, 3.2, 1.3, 0.2),  # setosa
    (6.4, 3.2, 4.5, 1.5),  # versicolor
    (7.1, 3.0, 5.9, 2.1),  # virginica
    (6.9, 3.1, 4.9, 1.5),  # versicolor
    (5.8, 2.7, 5.1, 1.9),  # virginica
]

labels = [
    'setosa',
    'versicolor',
    'virginica',
    'setosa',
    'setosa',
    'versicolor',
    'virginica',
    'versicolor',
    'virginica'
]


model = DecisionTreeClassifier()
model.fit(features, labels)

to_predict = [
    (5.6, 2.3, 4.1, 2.9)
]

result = model.predict(to_predict)
print(result)
# ['virginica']
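
A fitted decision tree can also report per-class probabilities via predict_proba(); the columns follow the order of model.classes_. A quick sketch continuing the example above (the exact probabilities depend on the fitted tree):

result = model.predict_proba(to_predict)
print(result)
# [[0. 0. 1.]] - probabilities for ['setosa', 'versicolor', 'virginica']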

3.1.3. Classifier

Code 3.104. Classifier
# doctest: +SKIP_FILE

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, datasets


dataset = datasets.load_iris()
features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]


model = KNeighborsClassifier(n_neighbors=5)
model.fit(features_train, labels_train)
labels_predicted = model.predict(features_test)

accuracy = metrics.accuracy_score(labels_test, labels_predicted)
print(accuracy)
# 0.9736842105263158
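
Accuracy compresses performance into a single number; the metrics module can also show per-class detail. Continuing the example above:

print(metrics.confusion_matrix(labels_test, labels_predicted))
print(metrics.classification_report(labels_test, labels_predicted,
                                    target_names=dataset.target_names))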

3.1.4. Feature Selection

from sklearn.feature_selection import VarianceThreshold

features = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1]
]

# Remove boolean features that have the same value in more than 80%
# of the samples, i.e. with variance below 0.8 * (1 - 0.8) = 0.16
sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))

sel.fit_transform(features)
# array([[0, 1],
#        [1, 0],
#        [0, 0],
#        [1, 1],
#        [1, 0],
#        [1, 1]])
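
The first column contains a one in only 1 of 6 samples, so its variance p * (1 - p) = (1/6) * (5/6) ≈ 0.14 falls below the 0.8 * (1 - 0.8) = 0.16 threshold; the other two columns stay.
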
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

iris = load_iris()
features = iris.data
labels = iris.target

features.shape
# (150, 4)

best_features = SelectKBest(chi2, k=2).fit_transform(features, labels)
# array([[1.4, 0.2],
#        [1.4, 0.2],
#        ...
#        [5.4, 2.3],
#        [5.1, 1.8]])

best_features.shape
# (150, 2)
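
To check which columns were selected, the fitted selector exposes get_support(). For Iris, the chi2 scores favor the two petal measurements:

selector = SelectKBest(chi2, k=2).fit(features, labels)
selector.get_support()
# array([False, False,  True,  True]) - petal length and width are kept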

3.1.5. Evaluation
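
The two most common ways to evaluate a classifier are a single train/test score and k-fold cross validation; the following subsections show both.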

3.1.6. Score

Code 3.105. Score
# doctest: +SKIP_FILE

from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split


dataset = datasets.load_iris()
features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]

model = KNeighborsClassifier()
model.fit(features_train, labels_train)
model.predict(features_test)

score = model.score(features_test, labels_test)
accuracy = score * 100  # in percent

print(f'Accuracy is {accuracy:.2f}%')
# Accuracy is 97.37%
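
For a classifier, .score() returns the mean accuracy on the given data, so the value above equals metrics.accuracy_score(labels_test, model.predict(features_test)); the separate model.predict() call is not needed for scoring.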

3.1.7. Cross Validation

Code 3.106. Cross Validation
# doctest: +SKIP_FILE

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split


dataset = datasets.load_iris()
features = dataset.data
labels = dataset.target

data = train_test_split(features, labels, test_size=0.25, random_state=0)

features_train = data[0]
features_test = data[1]
labels_train = data[2]
labels_test = data[3]

model = KNeighborsClassifier()
scores = cross_val_score(model, features_train, labels_train, cv=5)
accuracy = scores.mean() * 100  # percent
stdev = scores.std() * 100      # percent

print(f'Accuracy is {accuracy:.2f}% (+/- {stdev:.2f}%)')
# Accuracy is 95.49% (+/- 4.98%)
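
With cv=5 the training data is split into five folds; the model is trained on four folds and validated on the fifth, rotating until each fold has served as validation once. Reporting the mean score together with its standard deviation gives a more reliable estimate than a single train/test split.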

3.1.8. Label Encoder

Code 3.107. Label Encoder
# doctest: +SKIP_FILE

from sklearn import preprocessing


features = [
    (5.1, 3.5, 1.4, 0.2),  # setosa
    (7.0, 3.2, 4.7, 1.4),  # versicolor
    (6.3, 3.3, 6.0, 2.5),  # virginica
    (4.9, 3.0, 1.4, 0.2),  # setosa
    (4.7, 3.2, 1.3, 0.2),  # setosa
    (6.4, 3.2, 4.5, 1.5),  # versicolor
    (7.1, 3.0, 5.9, 2.1),  # virginica
    (6.9, 3.1, 4.9, 1.5),  # versicolor
    (5.8, 2.7, 5.1, 1.9),  # virginica
]

labels_names = [
    'setosa',
    'versicolor',
    'virginica',
    'setosa',
    'setosa',
    'versicolor',
    'virginica',
    'versicolor',
    'virginica'
]

label_encoder = preprocessing.LabelEncoder()
labels = label_encoder.fit_transform(labels_names)
# array([0, 1, 2, 0, 0, 1, 2, 1, 2])

list(label_encoder.classes_)
# ['setosa', 'versicolor', 'virginica']

# 0: setosa
# 1: versicolor
# 2: virginica

list(label_encoder.inverse_transform([2, 2, 1]))
# ['virginica', 'virginica', 'versicolor']
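
The encoded integers can be used to train any classifier, and its numeric predictions can be mapped back to names. A minimal sketch reusing the data above (the predicted value is illustrative):

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(features, labels)  # labels are the encoded integers

predicted = model.predict([(5.6, 2.3, 4.1, 2.9)])
list(label_encoder.inverse_transform(predicted))
# ['virginica']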

3.1.9. Writing Your Own Classifier

3.1.10. Random Classifier

import random


class RandomNeighborClassifier:
    """Baseline classifier: predicts a randomly chosen training label."""

    def fit(self, features, labels):
        # No learning happens - just memorize the training data
        self.features_train = features
        self.labels_train = labels

    def predict(self, features_test):
        # For every test sample pick a random label seen in training
        predictions = []

        for _ in features_test:
            label = random.choice(self.labels_train)
            predictions.append(label)

        return predictions

Accuracy for the Iris dataset: 0.346666666667 (about chance level for three balanced classes)
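
A sketch of how such a baseline can be evaluated, following the same pattern as the earlier examples (the exact score varies between runs, since predictions are random):

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split

dataset = datasets.load_iris()
features_train, features_test, labels_train, labels_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=0)

model = RandomNeighborClassifier()
model.fit(features_train, labels_train)
predictions = model.predict(features_test)

print(metrics.accuracy_score(labels_test, predictions))
# roughly 0.33 - chance level for three balanced classes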

3.1.11. Practical Assignments

3.1.12. Nearest Neighbor Classifier

  • Assignment: Nearest Neighbor Classifier

  • Complexity: medium

  • Lines of code: 15 lines

  • Time: 21 min

  1. Write a nearest neighbor classifier

  2. Split the data into training and test sets half-and-half

  3. For the Iris dataset it should achieve an accuracy above 90%

  4. The NearestNeighborClassifier class should have an interface compatible with scikit-learn:

    1. .fit() - for training the model

    2. .predict() - for prediction

  5. For the comparison use accuracy = metrics.accuracy_score(labels_test, labels_predicted)

  6. Run doctests - all must succeed

Hints:
  • For each test sample, compute its distance to every training sample

  • Pick the smallest of all the distances and use that neighbor's label

  • Use the Euclidean metric to compute the distances

  • from scipy.spatial.distance import euclidean as euclidean_distance

  • from sklearn import metrics
    from scipy.spatial.distance import euclidean as euclidean_distance
    from sklearn.model_selection import train_test_split
    from sklearn import datasets
    
    
    class NearestNeighborClassifier:
        def fit(self, features, labels):
            raise NotImplementedError
    
        def predict(self, features_test):
            raise NotImplementedError
    
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target
    
    data = train_test_split(features, labels, test_size=0.25, random_state=0)
    
    features_train = data[0]
    features_test = data[1]
    labels_train = data[2]
    labels_test = data[3]
    
    model = NearestNeighborClassifier()
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)
    accuracy = metrics.accuracy_score(labels_test, predictions)
    
    print(accuracy)
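
The euclidean_distance helper imported in the hints computes the straight-line distance between two points, e.g. euclidean_distance([0, 0], [3, 4]) returns 5.0.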
    

3.1.13. Sklearn Classifier Compare

  • Assignment: Sklearn Classifier Compare

  • Complexity: medium

  • Lines of code: 15 lines

  • Time: 21 min

  1. Load the Breast Cancer Dataset (datasets.load_breast_cancer())

  2. Split the set into test data (15%) and training data (85%) and set random_state=0

  3. Analyze the data using several different models

  4. Print each model's name, accuracy, and standard deviation

  5. Run doctests - all must succeed

Nearest Neighbors | Accuracy: 71.18% (+/- 3.78%)
       Linear SVM | Accuracy: 76.04% (+/- 2.79%)
          RBF SVM | Accuracy: 64.24% (+/- 0.22%)
 Gaussian Process | Accuracy: 68.58% (+/- 3.07%)
    Decision Tree | Accuracy: 68.24% (+/- 4.53%)
    Random Forest | Accuracy: 73.96% (+/- 3.28%)
       Neural Net | Accuracy: 65.28% (+/- 2.75%)
         AdaBoost | Accuracy: 72.57% (+/- 4.16%)
      Naive Bayes | Accuracy: 73.62% (+/- 2.89%)
              QDA | Accuracy: 73.97% (+/- 4.42%)

Hints:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


classifiers = [
    {'name': "Nearest Neighbors", 'model': KNeighborsClassifier()},
    {'name': "Linear SVM",        'model': SVC(kernel="linear")},
    {'name': "RBF SVM",           'model': SVC(kernel="rbf")},
    {'name': "Gaussian Process",  'model': GaussianProcessClassifier()},
    {'name': "Decision Tree",     'model': DecisionTreeClassifier()},
    {'name': "Random Forest",     'model': RandomForestClassifier()},
    {'name': "Neural Net",        'model': MLPClassifier(max_iter=1500)},
    {'name': "AdaBoost",          'model': AdaBoostClassifier()},
    {'name': "Naive Bayes",       'model': GaussianNB()},
    {'name': "QDA",               'model': QuadraticDiscriminantAnalysis()},
]
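
One possible shape for the comparison loop, reusing the cross-validation pattern from section 3.1.7 (a sketch, not the only valid solution; it assumes the Breast Cancer data has already been split into features_train and labels_train as in the earlier examples):

for classifier in classifiers:
    name = classifier['name']
    model = classifier['model']
    scores = cross_val_score(model, features_train, labels_train, cv=5)
    accuracy = scores.mean() * 100  # percent
    stdev = scores.std() * 100      # percent
    print(f'{name:>17} | Accuracy: {accuracy:.2f}% (+/- {stdev:.2f}%)')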
Extra task:
  • Parallelize the predictions using the threading module and a worker-based architecture

  • Print the list sorted in descending order by accuracy