4.4. DataFrame Sample

import pandas as pd
import numpy as np
# Fix the seed of NumPy's global generator so the values below are reproducible.
np.random.seed(0)

# Seven consecutive days (rows) by four times of day (columns),
# filled with 7 x 4 values drawn by ``np.random.randn``.
df = pd.DataFrame(
    np.random.randn(7, 4),
    index=pd.date_range('1999-12-30', periods=7),
    columns=['Morning', 'Noon', 'Evening', 'Midnight'],
)

df
#              Morning      Noon   Evening  Midnight
# 1999-12-30  1.764052  0.400157  0.978738  2.240893
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 2000-01-02  0.761038  0.121675  0.443863  0.333674
# 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

4.4.2. Tail

# Positional form: return the last two rows of the frame.
df.tail(2)
#              Morning      Noon   Evening  Midnight
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

# Keyword form: `n` is the number of trailing rows to return.
df.tail(n=1)
#              Morning      Noon   Evening  Midnight
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

4.4.3. First

# `first(offset)` selects the leading rows of a date-indexed frame that fall
# within the initial period described by the offset alias.
# NOTE(review): newer pandas releases deprecate DataFrame.first() - confirm
# against the pandas version this material targets.

# 'Y' (year): rows up to the end of the first row's year, i.e. 1999.
df.first('Y')
#              Morning      Noon   Evening  Midnight
# 1999-12-30  1.764052  0.400157  0.978738  2.240893
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357

# 'M' (month): rows up to the end of the first row's month, December 1999.
df.first('M')
#              Morning      Noon   Evening  Midnight
# 1999-12-30  1.764052  0.400157  0.978738  2.240893
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357

# 'D' (day): only the first day.
df.first('D')
#              Morning      Noon   Evening  Midnight
# 1999-12-30  1.764052  0.400157  0.978738  2.240893

# 'W' (week): rows through 2000-01-02, which closes the first row's week.
df.first('W')
#              Morning      Noon   Evening  Midnight
# 1999-12-30  1.764052  0.400157  0.978738  2.240893
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 2000-01-02  0.761038  0.121675  0.443863  0.333674

4.4.4. Last

# `last(offset)` selects the trailing rows of a date-indexed frame that fall
# within the final period described by the offset alias.
# NOTE(review): newer pandas releases deprecate DataFrame.last() - confirm
# against the pandas version this material targets.

# 'Y' (year): rows belonging to the last row's year, i.e. 2000.
df.last('Y')
#              Morning      Noon   Evening  Midnight
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 2000-01-02  0.761038  0.121675  0.443863  0.333674
# 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

# 'M' (month): rows belonging to the last row's month, January 2000.
df.last('M')
#              Morning      Noon   Evening  Midnight
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 2000-01-02  0.761038  0.121675  0.443863  0.333674
# 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

# 'D' (day): only the last day.
df.last('D')
#              Morning      Noon   Evening  Midnight
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

# 'W' (week): rows of the last row's week, from 2000-01-03 onwards.
df.last('W')
#              Morning      Noon   Evening  Midnight
# 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

4.4.5. Sample

  • 1/4 is 25%

  • 0.05 is 5%

  • 0.5 is 50%

  • 1.0 is 100%

Return `n` random rows, or a random fraction (`frac`) of the rows, with or without replacement:

# The rows drawn are random, so the listings below are example output only.

# No arguments: a single random row.
df.sample()
#              Morning      Noon   Evening  Midnight
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274

# Positional count: two distinct random rows.
df.sample(2)
#              Morning      Noon   Evening  Midnight
# 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165

# replace=True samples with replacement, so the same row may appear twice.
df.sample(n=2, replace=True)
#              Morning      Noon   Evening  Midnight
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357

# frac=1/4 of the 7 rows is 1.75, which yields 2 rows here.
df.sample(frac=1/4)
#              Morning      Noon   Evening  Midnight
# 2000-01-02  0.761038  0.121675  0.443863  0.333674
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357

# frac=0.5 of the 7 rows is 3.5, which yields 4 rows here.
df.sample(frac=0.5)
#              Morning      Noon   Evening  Midnight
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184
# 1999-12-30  1.764052  0.400157  0.978738  2.240893
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357

4.4.6. Reset Index

# frac=1.0 returns every row in random order (a shuffle); reset_index() then
# moves the old date index into an 'index' column and renumbers rows from 0.
df.sample(frac=1.0).reset_index()
#        index   Morning      Noon   Evening  Midnight
# 0 2000-01-02  0.761038  0.121675  0.443863  0.333674
# 1 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 3 1999-12-31  1.867558 -0.977278  0.950088 -0.151357
# 4 2000-01-05  2.269755 -1.454366  0.045759 -0.187184
# 5 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 6 1999-12-30  1.764052  0.400157  0.978738  2.240893
import pandas as pd

# A small in-memory excerpt of the Iris dataset: one dict per flower.
DATA = [{'sepal_length': 5.4, 'sepal_width': 3.9, 'petal_length': 1.3, 'petal_width': 0.4, 'species': 'setosa'},
        {'sepal_length': 5.9, 'sepal_width': 3.0, 'petal_length': 5.1, 'petal_width': 1.8, 'species': 'virginica'},
        {'sepal_length': 6.0, 'sepal_width': 3.4, 'petal_length': 4.5, 'petal_width': 1.6, 'species': 'versicolor'},
        {'sepal_length': 7.3, 'sepal_width': 2.9, 'petal_length': 6.3, 'petal_width': 1.8, 'species': 'virginica'},
        {'sepal_length': 5.6, 'sepal_width': 2.5, 'petal_length': 3.9, 'petal_width': 1.1, 'species': 'versicolor'},
        {'sepal_length': 5.4, 'sepal_width': 3.9, 'petal_length': 1.3, 'petal_width': 0.4, 'species': 'setosa'}]

# DATA is a list of records, not a CSV path or buffer, so it is loaded with
# the DataFrame constructor (pd.read_csv does not accept a list).
df = pd.DataFrame(DATA)

# Draw half of the six rows at random (3 rows). Which rows are picked depends
# on the random state, so the listings below are example output only.
selected = df.sample(frac=0.5)
#    sepal_length  sepal_width  petal_length  petal_width     species
# 4           5.6          2.5           3.9          1.1  versicolor
# 1           5.9          3.0           5.1          1.8   virginica
# 2           6.0          3.4           4.5          1.6  versicolor

# reset_index() keeps the old row labels in a new 'index' column.
selected.reset_index()
#    index  sepal_length  sepal_width  petal_length  petal_width     species
# 0      4           5.6          2.5           3.9          1.1  versicolor
# 1      1           5.9          3.0           5.1          1.8   virginica
# 2      2           6.0          3.4           4.5          1.6  versicolor

# drop=True discards the old row labels instead of keeping them as a column.
selected.reset_index(drop=True)
#    sepal_length  sepal_width  petal_length  petal_width     species
# 0           5.6          2.5           3.9          1.1  versicolor
# 1           5.9          3.0           5.1          1.8   virginica
# 2           6.0          3.4           4.5          1.6  versicolor

4.4.7. Assignments

Code 4.47. Solution
"""
* Assignment: DataFrame Sample
* Complexity: easy
* Lines of code: 4 lines
* Time: 8 min

English:
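    1. Read data from `DATA` as `df: pd.DataFrame`
    2. Set all rows in random order
    3. Reset index without leaving a backup of the old one
    4. Define `result` with last 10% of rows
    5. Run doctests - all must succeed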

Polish:
    1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
    2. Ustaw wszystkie wiersze w losowej kolejności
    3. Zresetuj index nie pozostawiając kopii zapasowej starego
    4. Zdefiniuj `result` z ostatnimi 10% wierszy
    5. Uruchom doctesty - wszystkie muszą się powieść

Tests:
    >>> import sys; sys.tracebacklimit = 0

    >>> type(result) is pd.DataFrame
    True
    >>> pd.set_option('display.width', 500)
    >>> pd.set_option('display.max_columns', 10)
    >>> pd.set_option('display.max_rows', 10)
    >>> result  # doctest: +NORMALIZE_WHITESPACE
                        Name        Country Gender                                            Flights  Total Flights Total Flight Time (ddd:hh:mm)
    0        Viktor Patsayev   Soviet Union    Man                                    Soyuz 11 (1971)              1                     023:21:21
    1       Stephen G. Bowen  United States    Man     STS-126 (2008), STS-132 (2010), STS-133 (2011)              3                     040:10:04
    2           Sergei Revin         Russia    Man                               Soyuz TMA-04M (2012)              1                     124:23:51
    3         Maksim Surayev         Russia    Man          Soyuz TMA-16 (2009), Soyuz TMA-13M (2014)              2                     334:12:09
    4          Andrew Thomas  United States    Man  STS-77 (1996), STS-89 (1998), STS-102 (2001), ...              4                     177:09:14
    ..                   ...            ...    ...                                                ...            ...                           ...
    562  Lawrence J. DeLucas  United States    Man                                      STS-50 (1992)              1                     013:19:30
    563   Aleksandr Laveykin   Soviet Union    Man                                  Soyuz TM-2 (1987)              1                     174:03:25
    564        Owen Garriott  United States    Man                      Skylab 3 (1973), STS-9 (1983)              2                     069:17:56
    565          Ivan Vagner         Russia    Man                                 Soyuz MS-16 (2020)              1                     145:04:14
    566     Yuri Malenchenko         Russia    Man  Soyuz TM-19 (1994), STS-106 (2000), Soyuz TMA-...              6                     826:09:22
    <BLANKLINE>
    [567 rows x 6 columns]
"""

import pandas as pd
import numpy as np
np.random.seed(0)


# URL of the CSV file that the assignment reads.
DATA = r'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/astro-database.csv'

# Placeholder to be replaced with the solution.
# NOTE(review): step 4 of the task asks for the last 10% of rows, yet the
# doctest above expects a 567-row frame indexed 0..566 - confirm which one
# is intended.
result = ...