5.3. Pandas Read

5.3.1. Import data

  • File paths works also with URLs

  • SQL functions uses SQLAlchemy, which supports many RDBMS

import pandas as pd


# Important
pd.read_csv()
pd.read_excel()
pd.read_html()
pd.read_json()
pd.read_sql()        # Read SQL query or database table into a DataFrame

# Others
pd.read_clipboard()
pd.read_feather()
pd.read_fwf()
pd.read_gbq()
pd.read_hdf()
pd.read_msgpack()
pd.read_parquet()
pd.read_pickle()
pd.read_sas()
pd.read_spss()
pd.read_sql_query()  # Read SQL query into a DataFrame
pd.read_sql_table()  # Read SQL database table into a DataFrame
pd.read_stata()
pd.read_table()

5.3.2. Examples

import pandas as pd

DATA = 'https://python.astrotech.io/_static/iris-clean.csv'
header = pd.read_csv(DATA, nrows=0).columns
import pandas as pd


DATA = 'https://python.astrotech.io/_static/iris-clean.csv'
df = pd.read_csv(DATA)

df.head(3)
#    sepal_length  sepal_width  petal_length  petal_width     species
# 0           5.4          3.9           1.3          0.4      setosa
# 1           5.9          3.0           5.1          1.8   virginica
# 2           6.0          3.4           4.5          1.6  versicolor
import pandas as pd


DATA = 'https://python.astrotech.io/_static/iris-dirty.csv'
COLUMNS =  ['sepal_length', 'sepal_width',
            'petal_length', 'petal_width', 'species']

df = pd.read_csv(DATA)
df.head(3)
#      150    4  setosa  versicolor  virginica
# 0    5.4  3.9     1.3         0.4          0
# 1    5.9  3.0     5.1         1.8          2
# 2    6.0  3.4     4.5         1.6          1

df = pd.read_csv(url, skiprows=1, names=COLUMNS)
df.head(3)
#    sepal_length  sepal_width  petal_length  petal_width  species
# 0           5.4          3.9           1.3          0.4        0
# 1           5.9          3.0           5.1          1.8        2
# 2           6.0          3.4           4.5          1.6        1

df['species'].replace({
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica',
}, inplace=True)
#    sepal_length  sepal_width  petal_length  petal_width  species
# 0           5.4          3.9           1.3          0.4        setosa
# 1           5.9          3.0           5.1          1.8        virginica
# 2           6.0          3.4           4.5          1.6        versicolor

5.3.3. Compressed

  • If the extension is .gz, .bz2, .zip, and .xz, the corresponding compression method is automatically selected

df = pd.read_json('sample_file.gz', compression='infer')

5.3.4. Read HTML

URL = 'https://python.astrotech.io/numerical-analysis/pandas/df-create.html'

pd.read_html(URL)
# Traceback (most recent call last):
#   ...
# urllib.error.HTTPError: HTTP Error 403: Forbidden
import requests

resp = requests.get(URL, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'})

pd.read_html(resp.content)[0]
#      Crew Role        Astronaut
# 0   Prime  CDR   Neil Armstrong
# 1   Prime  LMP      Buzz Aldrin
# 2   Prime  CMP  Michael Collins
# 3  Backup  CDR     James Lovell
# 4  Backup  LMP   William Anders
# 5  Backup  CMP       Fred Haise

5.3.5. StringIO

  • Converts str to File-like object

from io import StringIO


DATA = """
"Crew", "Role", "Astronaut"
"Prime", "CDR", "Neil Armstrong"
"Prime", "LMP", "Buzz Aldrin"
"Prime", "CMP", "Michael Collins"
"Backup", "CDR", "James Lovell"
"Backup", "LMP", "William Anders"
"Backup", "CMP", "Fred Haise"
"""

data = StringIO(DATA)
pd.read_csv(data)
#      Crew  "Role"         "Astronaut"
# 0   Prime   "CDR"    "Neil Armstrong"
# 1   Prime   "LMP"       "Buzz Aldrin"
# 2   Prime   "CMP"   "Michael Collins"
# 3  Backup   "CDR"      "James Lovell"
# 4  Backup   "LMP"    "William Anders"
# 5  Backup   "CMP"        "Fred Haise"
from io import StringIO


DATA = 'https://python.astrotech.io/_static/astro-flights.csv'
resp = requests.get(DATA)
data = StringIO(resp.text)

pd.read_csv(data)
#      Order           Astronaut         Date       Mission
# 0      1.0        Yuri Gagarin   1961-04-12        Vostok
# 1      2.0       Gherman Titov   1961-08-06      Vostok 2
# 2      3.0   Andrian Nikolayev   1962-08-11      Vostok 3
# 3      4.0      Pavel Popovich   1962-08-12      Vostok 4
# 4      5.0     Valeri Bykovsky   1963-06-14      Vostok 5
# ..     ...                 ...          ...           ...
# 530  531.0      Thomas Pesquet   2016-11-17   Soyuz MS-03
# 531  532.0        Jack Fischer   2017-04-20   Soyuz MS-04
# 532  533.0      Mark Vande Hei   2017-09-12   Soyuz MS-06
# 533  534.0     Norishige Kanai   2017-12-17   Soyuz MS-07
# 534    NaN        Scott Tingle   2017-12-17   Soyuz MS-07
# [535 rows x 4 columns]

5.3.6. Read SQL

import sqlite3
import requests

DATABASE = r'/tmp/astro-timeline.sqlite3'
DATA = r'https://python.astrotech.io/_static/astro-timeline.sqlite3'
SQL = r'SELECT * FROM logs'

with open(DATABASE, mode='wb') as db:
    resp = requests.get(URL)
    db.write(resp.content)

with sqlite3.connect(DATABASE) as db:
    astro_timeline = pd.read_sql(SQL, db, parse_dates=['datetime'])

astro_timeline
#     id  ...                                            message
# 0    1  ...                         Terminal countdown started
# 1    2  ...                          S-IC engine ignition (#5)
# 2    3  ...          Maximum dynamic pressure (735.17 lb/ft^2)
# 3    4  ...                                      S-II ignition
# 4    5  ...                     Launch escape tower jettisoned
# 5    6  ...                          S-II center engine cutoff
# 6    7  ...                               Translunar injection
# 7    8  ...                           CSM docked with LM/S-IVB
# 8    9  ...                     Lunar orbit insertion ignition
# 9   10  ...               Lunar orbit circularization ignition
# 10  11  ...                                    CSM/LM undocked
# 11  12  ...                 LM powered descent engine ignition
# 12  13  ...                                      LM 1202 alarm
# 13  14  ...                                      LM 1201 alarm
# 14  15  ...                                   LM lunar landing
# 15  16  ...                           EVA started (hatch open)
# 16  17  ...                 1st step taken lunar surface (CDR)
# 17  18  ...  That's one small step for [a] man... one giant...
# 18  19  ...        Contingency sample collection started (CDR)
# 19  20  ...                               LMP on lunar surface
# 20  21  ...                           EVA ended (hatch closed)
# 21  22  ...                 LM lunar liftoff ignition (LM APS)
# 22  23  ...                                      CSM/LM docked
# 23  24  ...                Transearth injection ignition (SPS)
# 24  25  ...                                   CM/SM separation
# 25  26  ...                                              Entry
# 26  27  ...                     Splashdown (went to apex-down)
# 27  28  ...                                        Crew egress
# [28 rows x 4 columns]

5.3.7. Assignments

5.3.7.1. Pandas Read CSV Dates

English
  1. Use data from "Input" section (see below)

  2. Read data from DATA to df: pd.DataFrame

  3. Parse dates in "Mission Date" column

Polish
  1. Użyj danych z sekcji "Input" (patrz poniżej)

  2. Wczytaj dane z DATA do df: pd.DataFrame

  3. Sparsuj daty w kolumnie "Mission Date"

Input
DATA = 'https://python.astrotech.io/_static/martian-en.csv'
Hint
  • parse_dates argument

5.3.7.2. Pandas Read CSV Replace

English
  1. Use data from "Input" section (see below)

  2. Read data from DATA to breast_cancer: pd.DataFrame

  3. Use provided column names in COLUMNS

  4. Read labels from the first row

  5. Replace data in label column with values extracted above

  6. Print DataFrame

  7. Print first 5 rows from breast_cancer

Polish
  1. Użyj danych z sekcji "Input" (patrz poniżej)

  2. Wczytaj dane z DATA do breast_cancer: pd.DataFrame

  3. Użyj podanych w COLUMNS nazw kolumn

  4. Wczytaj nazwy labeli z pierwszego wiersza

  5. Podmień dane w kolumnie label na wartości wyciągnięte powyżej

  6. Wypisz pierwsze 5 wierszy z breast_cancer

Input
DATA = 'https://python.astrotech.io/_static/breast-cancer.csv'

COLUMNS = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
           'mean smoothness', 'mean compactness', 'mean concavity',
           'mean concave points', 'mean symmetry', 'mean fractal dimension',
           'radius error', 'texture error', 'perimeter error', 'area error',
           'smoothness error', 'compactness error', 'concavity error',
           'concave points error', 'symmetry error',
           'fractal dimension error', 'worst radius', 'worst texture',
           'worst perimeter', 'worst area', 'worst smoothness',
           'worst compactness', 'worst concavity', 'worst concave points',
           'worst symmetry', 'worst fractal dimension', 'label']
The whys and wherefores
  • Read Pandas DataFrame

Hint
  • pd.read_csv(url, nrows=0).columns

  • df['label'].replace({'from': 'to'}, inplace=True)

5.3.7.3. Pandas Read JSON

English
  1. Use data from "Input" section (see below)

  2. Using pd.read_json() read data from DATA

  3. Print df: pd.DataFrame

Polish
  1. Użyj danych z sekcji "Input" (patrz poniżej)

  2. Za pomocą pd.read_json() wczytaj dane z DATA

  3. Wypisz df: pd.DataFrame

Input
DATA = 'https://python.astrotech.io/_static/iris.json'

5.3.7.4. Pandas Read HTML

English
  1. Use data from "Input" section (see below)

  2. Using pd.read_html() read data from DATA

  3. Print df: pd.DataFrame with active European Space Agency astronauts

Polish
  1. Użyj danych z sekcji "Input" (patrz poniżej)

  2. Za pomocą pd.read_html() wczytaj dane z DATA

  3. Wypisz df: pd.DataFrame z aktywnymi astronautami Europejskiej Agencji Kosmicznej

Input
DATA = 'https://en.wikipedia.org/wiki/European_Astronaut_Corps'
Hint
  • Might require lxml and html5lib: pip install --upgrade lxml html5lib

  • 3rd table

5.3.7.5. Pandas Read XML XSLT

English
  1. Use data from "Input" section (see below)

  2. Read data from DATA as plants: pd.DataFrame

  3. Using XSLT transformation convert it to pandas readable format

  4. Read data to df: pd.DataFrame

  5. Make sure that columns and indexes are named properly

  6. Calculate average cost of flower

Polish
  1. Użyj danych z sekcji "Input" (patrz poniżej)

  2. Wczytaj dane z DATA jako plants: pd.DataFrame

  3. Używając transformaty XSLT sprowadź je do formatu zrozumiałego dla Pandas

  4. Wczytaj dane do df: pd.DataFrame

  5. Upewnij się, że nazwy kolumn i indeks są dobrze ustawione

  6. Wylicz średni koszt kwiatów

Input
DATA = 'https://python.astrotech.io/_static/plants.csv'
Hint
  • Require lxml: pip install lxml