2.1. Pandas Read¶
2.1.1. Import data¶
File paths works also with URLs
SQL functions uses SQLAlchemy, which supports many RDBMS
import pandas as pd
# Important
pd.read_csv()
pd.read_excel()
pd.read_html()
pd.read_json()
pd.read_sql() # Read SQL query or database table into a DataFrame
# Others
pd.read_clipboard()
pd.read_feather()
pd.read_fwf()
pd.read_gbq()
pd.read_hdf()
pd.read_msgpack()
pd.read_parquet()
pd.read_pickle()
pd.read_sas()
pd.read_spss()
pd.read_sql_query() # Read SQL query into a DataFrame
pd.read_sql_table() # Read SQL database table into a DataFrame
pd.read_stata()
pd.read_table()
2.1.2. Examples¶
import pandas as pd
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/iris-clean.csv'
header = pd.read_csv(DATA, nrows=0).columns
import pandas as pd
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/iris-clean.csv'
df = pd.read_csv(DATA)
df.head(3)
# sepal_length sepal_width petal_length petal_width species
# 0 5.4 3.9 1.3 0.4 setosa
# 1 5.9 3.0 5.1 1.8 virginica
# 2 6.0 3.4 4.5 1.6 versicolor
import pandas as pd
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/iris-dirty.csv'
COLUMNS = ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'species']
df = pd.read_csv(DATA)
df.head(3)
# 150 4 setosa versicolor virginica
# 0 5.4 3.9 1.3 0.4 0
# 1 5.9 3.0 5.1 1.8 2
# 2 6.0 3.4 4.5 1.6 1
df = pd.read_csv(url, skiprows=1, names=COLUMNS)
df.head(3)
# sepal_length sepal_width petal_length petal_width species
# 0 5.4 3.9 1.3 0.4 0
# 1 5.9 3.0 5.1 1.8 2
# 2 6.0 3.4 4.5 1.6 1
df['species'].replace({
0: 'setosa',
1: 'versicolor',
2: 'virginica',
}, inplace=True)
# sepal_length sepal_width petal_length petal_width species
# 0 5.4 3.9 1.3 0.4 setosa
# 1 5.9 3.0 5.1 1.8 virginica
# 2 6.0 3.4 4.5 1.6 versicolor
2.1.3. Compressed¶
If the extension is
.gz
,.bz2
,.zip
, and.xz
, the corresponding compression method is automatically selected
df = pd.read_json('sample_file.gz', compression='infer')
2.1.4. Read HTML¶
DATA = 'https://python.astrotech.io/numerical-analysis/pandas/df-create.html'
pd.read_html(DATA)
# Traceback (most recent call last):
# urllib.error.HTTPError: HTTP Error 403: Forbidden
import requests
DATA = 'https://python.astrotech.io/numerical-analysis/pandas/df-create.html'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
resp = requests.get(DATA, headers={'User-Agent': USER_AGENT})
dfs = pd.read_html(resp.content)
dfs[0]
# Crew Role Astronaut
# 0 Prime CDR Neil Armstrong
# 1 Prime LMP Buzz Aldrin
# 2 Prime CMP Michael Collins
# 3 Backup CDR James Lovell
# 4 Backup LMP William Anders
# 5 Backup CMP Fred Haise
2.1.5. StringIO¶
Converts
str
to File-like object
from io import StringIO
DATA = """
"Crew", "Role", "Astronaut"
"Prime", "CDR", "Neil Armstrong"
"Prime", "LMP", "Buzz Aldrin"
"Prime", "CMP", "Michael Collins"
"Backup", "CDR", "James Lovell"
"Backup", "LMP", "William Anders"
"Backup", "CMP", "Fred Haise"
"""
data = StringIO(DATA)
df = pd.read_csv(data)
df
# Crew "Role" "Astronaut"
# 0 Prime "CDR" "Neil Armstrong"
# 1 Prime "LMP" "Buzz Aldrin"
# 2 Prime "CMP" "Michael Collins"
# 3 Backup "CDR" "James Lovell"
# 4 Backup "LMP" "William Anders"
# 5 Backup "CMP" "Fred Haise"
from io import StringIO
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/astro-order.csv'
resp = requests.get(DATA)
data = StringIO(resp.text)
df = pd.read_csv(data)
df
# Order Astronaut Date Mission
# 0 1.0 Yuri Gagarin 1961-04-12 Vostok
# 1 2.0 Gherman Titov 1961-08-06 Vostok 2
# 2 3.0 Andrian Nikolayev 1962-08-11 Vostok 3
# 3 4.0 Pavel Popovich 1962-08-12 Vostok 4
# 4 5.0 Valeri Bykovsky 1963-06-14 Vostok 5
# .. ... ... ... ...
# 530 531.0 Thomas Pesquet 2016-11-17 Soyuz MS-03
# 531 532.0 Jack Fischer 2017-04-20 Soyuz MS-04
# 532 533.0 Mark Vande Hei 2017-09-12 Soyuz MS-06
# 533 534.0 Norishige Kanai 2017-12-17 Soyuz MS-07
# 534 NaN Scott Tingle 2017-12-17 Soyuz MS-07
# [535 rows x 4 columns]
2.1.6. Read SQL¶
import sqlite3
import requests
DATA = r'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/sqlite3/astro-timeline.sqlite3'
DATABASE = r'/tmp/astro-timeline.sqlite3'
SQL = """
SELECT *
FROM logs
"""
with open(DATABASE, mode='wb') as db:
resp = requests.get(DATA)
db.write(resp.content)
with sqlite3.connect(DATABASE) as db:
astro_timeline = pd.read_sql(SQL, db, parse_dates=['datetime'])
astro_timeline
# id ... message
# 0 1 ... Terminal countdown started
# 1 2 ... S-IC engine ignition (#5)
# 2 3 ... Maximum dynamic pressure (735.17 lb/ft^2)
# 3 4 ... S-II ignition
# 4 5 ... Launch escape tower jettisoned
# 5 6 ... S-II center engine cutoff
# 6 7 ... Translunar injection
# 7 8 ... CSM docked with LM/S-IVB
# 8 9 ... Lunar orbit insertion ignition
# 9 10 ... Lunar orbit circularization ignition
# 10 11 ... CSM/LM undocked
# 11 12 ... LM powered descent engine ignition
# 12 13 ... LM 1202 alarm
# 13 14 ... LM 1201 alarm
# 14 15 ... LM lunar landing
# 15 16 ... EVA started (hatch open)
# 16 17 ... 1st step taken lunar surface (CDR)
# 17 18 ... That's one small step for [a] man... one giant...
# 18 19 ... Contingency sample collection started (CDR)
# 19 20 ... LMP on lunar surface
# 20 21 ... EVA ended (hatch closed)
# 21 22 ... LM lunar liftoff ignition (LM APS)
# 22 23 ... CSM/LM docked
# 23 24 ... Transearth injection ignition (SPS)
# 24 25 ... CM/SM separation
# 25 26 ... Entry
# 26 27 ... Splashdown (went to apex-down)
# 27 28 ... Crew egress
# [28 rows x 4 columns]
2.1.7. XML and XSLT¶
from io import StringIO
from lxml.etree import XML, XSLT, parse
import pandas as pd
DATA = """<?xml version="1.0"?>
<catalog>
<book id="bk101">
<author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications
with XML.</description>
</book>
<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies,
an evil sorceress, and her own childhood to become queen
of the world.</description>
</book>
<book id="bk103">
<author>Corets, Eva</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology
society in England, the young survivors lay the
foundation for a new society.</description>
</book>
</catalog>
"""
TEMPLATE = """
<html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<table>
<thead>
<tr>
<th>Id</th>
<th>Author</th>
<th>Title</th>
<th>Genre</th>
<th>Price</th>
<th>Publish Date</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<xsl:for-each select="catalog/book">
<tr>
<td><xsl:value-of select="@id"/></td>
<td><xsl:value-of select="author"/></td>
<td><xsl:value-of select="title"/></td>
<td><xsl:value-of select="genre"/></td>
<td><xsl:value-of select="price"/></td>
<td><xsl:value-of select="publish_date"/></td>
<td><xsl:value-of select="description"/></td>
</tr>
</xsl:for-each>
</tbody>
</table>
</html>
"""
transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
html = str(transform(data))
dfs = pd.read_html(html)
result = dfs[0]
result
# [ Id ... Description
# 0 bk101 ... An in-depth look at creating applications wit...
# 1 bk102 ... A former architect battles corporate zombies, ...
# 2 bk103 ... After the collapse of a nanotechnology societ...
# [3 rows x 7 columns]]
type(result) is pd.DataFrame
# True
len(result) > 0
# True
result.columns
# Index(['Id', 'Author', 'Title', 'Genre', 'Price', 'Publish Date',
# 'Description'],
# dtype='object')
result['Title']
# 0 XML Developer's Guide
# 1 Midnight Rain
# 2 Maeve Ascendant
# Name: Title, dtype: object
2.1.8. Assignments¶
"""
* Assignment: Pandas Read CSV Dates
* Complexity: easy
* Lines of code: 1 lines
* Time: 3 min
English:
1. Use data from "Given" section (see below)
2. Read data from `DATA` to `result: pd.DataFrame`
3. Parse dates in "Mission Date" column
4. Print `result`
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Wczytaj dane z `DATA` do `result: pd.DataFrame`
3. Sparsuj daty w kolumnie "Mission Date"
4. Wypisz `result`
Hints:
* `parse_dates`
Tests:
>>> type(result) is pd.DataFrame
True
>>> len(result) > 0
True
>>> result
id First Name Last Name Mission Date
0 1 Jan Twardowski 1988-01-05
1 2 Mark Watney 1969-07-21
2 3 Ivan Ivanovich 1961-04-12
3 4 Melissa Lewis 1970-01-01
4 5 Alex Vogel 1968-12-25
"""
# Given
import pandas as pd
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/martian-en.csv'
"""
* Assignment: Pandas Read CSV Replace
* Complexity: easy
* Lines of code: 5 lines
* Time: 8 min
English:
1. Use data from "Given" section (see below)
2. Read data from `DATA` to `result: pd.DataFrame`
3. Use provided column names in `COLUMNS`
4. Read labels from the first row
5. Replace data in `label` column with values extracted above
6. Define `result: pd.DataFrame` with 20 first rows
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Wczytaj dane z `DATA` do `result: pd.DataFrame`
3. Użyj podanych w `COLUMNS` nazw kolumn
4. Wczytaj nazwy labeli z pierwszego wiersza
5. Podmień dane w kolumnie `label` na wartości wyciągnięte powyżej
6. Zdefiniuj `result: pd.DataFrame` z 20stoma pierwszymi wierszami
Hints:
* `hader = pd.read_csv(url, nrows=0).columns`
* `cancer_types = dict(enumerate(header[2:]))`
* `df['label'].replace({'from': 'to'}, inplace=True)`
Tests:
>>> type(result) is pd.DataFrame
True
>>> len(result) == 20
True
>>> result.loc[[0,9,19], ['mean radius', 'mean texture', 'label']]
mean radius mean texture label
0 17.99 10.38 malignant
9 12.46 24.04 malignant
19 13.54 14.36 benign
"""
# Given
import pandas as pd
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/breast-cancer.csv'
COLUMNS = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension', 'label']
"""
* Assignment: Pandas Read JSON
* Complexity: easy
* Lines of code: 1 lines
* Time: 3 min
English:
1. Use data from "Given" section (see below)
2. Read data from `DATA` as `result: pd.DataFrame`
3. Print `result`
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Wczytaj dane z DATA jako result: pd.DataFrame
3. Wypisz `result`
Tests:
>>> type(result) is pd.DataFrame
True
>>> len(result) > 0
True
>>> result.loc[[0,10,20]]
sepalLength sepalWidth petalLength petalWidth species
0 5.1 3.5 1.4 0.2 setosa
10 7.0 3.2 4.7 1.4 versicolor
20 6.3 3.3 6.0 2.5 virginica
"""
# Given
import pandas as pd
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/json/iris.json'
"""
* Assignment: Pandas Read JSON OpenAPI
* Complexity: easy
* Lines of code: 3 lines
* Time: 5 min
English:
1. Use data from "Given" section (see below)
2. Define `resp` with result of `requests.get()` for `DATA`
3. Define `data` with conversion of `resp` from JSON to Python dict by calling `.json()` on `resp`
5. Define `result: pd.DataFrame` from value for key `paths` in `data` dict
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Zdefiniuj `resp` z resultatem `requests.get()` dla `DATA`
3. Zdefiniuj `data` z przekształceniem `resp` z JSON do Python dict wywołując `.json()` na `resp`
4. Zdefiniuj `result: pd.DataFrame` dla wartości z klucza `paths` w słowniku `data`
Hints:
* `pd.DataFrame(data)`
Tests:
>>> type(result) is pd.DataFrame
True
>>> len(result) > 0
True
>>> list(result.index)
['put', 'post', 'get', 'delete']
>>> list(result.columns) # doctest: +NORMALIZE_WHITESPACE
['/pet', '/pet/findByStatus', '/pet/findByTags', '/pet/{petId}', '/pet/{petId}/uploadImage',
'/store/inventory', '/store/order', '/store/order/{orderId}',
'/user', '/user/createWithList', '/user/login', '/user/logout', '/user/{username}']
"""
# Given
import pandas as pd
import requests
DATA = 'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/json/openapi.json'
resp = ...
data = ...
result = ...
"""
* Assignment: Pandas Read HTML
* Complexity: easy
* Lines of code: 2 lines
* Time: 5 min
English:
1. Use data from "Given" section (see below)
2. Read data from `DATA` as `data: pd.DataFrame`
3. Define `result` with active European Space Agency astronauts
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Wczytaj dane z `DATA` jako `data: pd.DataFrame`
3. Zdefiniuj `result` z aktywnymi astronautami Europejskiej Agencji Kosmicznej
Hints:
* `pip install --upgrade lxml`
* 3rd table
Tests:
>>> type(result) is pd.DataFrame
True
>>> len(result) > 0
True
>>> result['Name']
0 Samantha Cristoforetti
1 Alexander Gerst
2 Andreas Mogensen
3 Luca Parmitano
4 Timothy Peake
5 Thomas Pesquet
6 Matthias Maurer
Name: Name, dtype: object
"""
# Given
import pandas as pd
DATA = 'https://en.wikipedia.org/wiki/European_Astronaut_Corps'
"""
* Assignment: Pandas Read XSLT Plants
* Complexity: medium
* Lines of code: 4 lines
* Time: 5 min
English:
1. Use data from "Given" section (see below)
2. Read data from `DATA` as `result: pd.DataFrame`
3. Use XSLT transformation
4. Make sure that columns and indexes are named properly
5. Calculate average cost of flower
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Wczytaj dane z `DATA` jako `result: pd.DataFrame`
3. Użyj transformaty XSLT
4. Upewnij się, że nazwy kolumn i indeks są dobrze ustawione
5. Wylicz średni koszt kwiatów
Hints:
* `pip install --upgrade lxml`
Tests:
>>> type(result) is pd.DataFrame
True
>>> len(result) > 0
True
>>> result
English Name Latin Name Cost
0 Bloodroot Sanguinaria canadensis $2.44
1 Columbine Aquilegia canadensis $9.37
2 Marsh Marigold Caltha palustris $6.81
3 Cowslip Caltha palustris $9.90
"""
# Given
import pandas as pd
from io import StringIO
from lxml.etree import XML, XSLT, parse
DATA = """
<CATALOG>
<PLANT>
<COMMON>Bloodroot</COMMON>
<BOTANICAL>Sanguinaria canadensis</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$2.44</PRICE>
<AVAILABILITY>031599</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Columbine</COMMON>
<BOTANICAL>Aquilegia canadensis</BOTANICAL>
<ZONE>3</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$9.37</PRICE>
<AVAILABILITY>030699</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Marsh Marigold</COMMON>
<BOTANICAL>Caltha palustris</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Sunny</LIGHT>
<PRICE>$6.81</PRICE>
<AVAILABILITY>051799</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Cowslip</COMMON>
<BOTANICAL>Caltha palustris</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$9.90</PRICE>
<AVAILABILITY>030699</AVAILABILITY>
</PLANT>
</CATALOG>
"""
TEMPLATE = """
<html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<table>
<thead>
<tr>
<th>English Name</th>
<th>Latin Name</th>
<th>Cost</th>
</tr>
</thead>
<xsl:for-each select="CATALOG/PLANT">
<tr>
<td><xsl:value-of select="COMMON"/></td>
<td><xsl:value-of select="BOTANICAL"/></td>
<td><xsl:value-of select="PRICE"/></td>
</tr>
</xsl:for-each>
</table>
</html>
"""