7.2. CSV COVID19
Data Source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
7.2.1. Case Study - 0x01
#%% Imports
from datetime import date
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday
from pandas.tseries.holiday import EasterMonday, Easter
from pandas.tseries.offsets import Day
import matplotlib.pyplot as plt
import matplotlib.axes
#%% Settings
# Attribute-style equivalents of ``pd.set_option``: widen the console output
# so that wide frames print legibly.
pd.options.display.width = 200
pd.options.display.max_columns = 15
pd.options.display.max_rows = 100

#%% Data Sources
# Upstream JHU CSSE URLs, kept for reference only:
#CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
#DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
#RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
# URLs that are actually loaded below.
CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
DEATHS = 'https://python3.info/_static/covid19-deaths.csv'
RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'

#%% Data Frames
# One CSV read per URL, in the same order as the names on the left.
confirmed, deaths, recovered = (
    pd.read_csv(url) for url in (CONFIRMED, DEATHS, RECOVERED))
#%% Get Country from DataFrame
def covid19(country: str = None) -> pd.DataFrame:
    """
    Build a date-indexed Confirmed/Deaths/Recovered table.

    The module-level ``confirmed``, ``deaths`` and ``recovered`` frames are
    each reduced to one column. When ``country`` is ``None`` no row filter
    is applied, so all rows are summed together.

    :param country: value to match against the ``Country/Region`` column,
        or ``None`` for every row
    :return: frame with columns ``Confirmed``, ``Deaths``, ``Recovered``

    >>> covid19('Poland').loc['2022-01-01']
    Confirmed    4120248
    Deaths         97559
    Recovered          0
    Name: 2022-01-01 00:00:00, dtype: int64

    >>> covid19('France').loc['2022-01-01']
    Confirmed    10296909
    Deaths         124839
    Recovered           0
    Name: 2022-01-01 00:00:00, dtype: int64

    >>> covid19().loc['2022-01-01']
    Confirmed    289931319
    Deaths         5473487
    Recovered            0
    Name: 2022-01-01 00:00:00, dtype: int64
    """
    def _get(data: pd.DataFrame, country: str = None) -> pd.Series:
        """
        Collapse one source frame into a single date-indexed series.

        Example results (``_get`` is local to ``covid19``)::

            _get(confirmed, 'Poland').loc['2022-01-01']  -> 4120248
            _get(deaths, 'Poland').loc['2022-01-01']     -> 97559
            _get(recovered, 'Poland').loc['2022-01-01']  -> 0
        """
        if country is None:
            rows = data
        else:
            rows = data.loc[data['Country/Region'] == country]
        # After transposing, the first four rows are skipped; the remaining
        # row labels are parsed as dates further down.
        per_date = rows.transpose().iloc[4:]
        totals = per_date.sum(axis='columns').astype('int64')
        return totals.rename(pd.to_datetime, axis='index')

    sources = {
        'Confirmed': confirmed,
        'Deaths': deaths,
        'Recovered': recovered,
    }
    return pd.DataFrame({
        label: _get(frame, country)
        for label, frame in sources.items()})
#%% Calendars
class PLHolidayCalendar(AbstractHolidayCalendar):
    """
    Holiday calendar for Poland expressed as pandas holiday rules.

    Reference: https://en.wikipedia.org/wiki/Public_holidays_in_Poland

    Movable feasts are anchored on 1 January and shifted with ``Easter()``
    (plus a fixed number of days), so they follow each year's Easter date.
    """
    rules = [
        # Fixed-date holidays at the start of the year.
        Holiday(name='New Years Day', month=1, day=1),
        Holiday(name='Epiphany', month=1, day=6),
        # Easter Sunday and the pandas-provided Easter Monday rule.
        Holiday(name='Easter', month=1, day=1, offset=[Easter()]),
        EasterMonday,
        Holiday(name='May Day', month=5, day=1),
        Holiday(name='Constitution Day', month=5, day=3),
        # 49 and 60 days after Easter Sunday respectively.
        Holiday(name='Pentecost Sunday', month=1, day=1,
                offset=[Easter(), Day(49)]),
        Holiday(name='Corpus Christi', month=1, day=1,
                offset=[Easter(), Day(60)]),
        Holiday(name='Assumption of the Blessed Virgin Mary', month=8, day=15),
        Holiday(name='All Saints Day', month=11, day=1),
        Holiday(name='Independence Day', month=11, day=11),
        Holiday(name='Christmas Day', month=12, day=25),
        Holiday(name='Second Day of Christmastide', month=12, day=26),
    ]
#%% Show trendline
def plot_trendline(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Draw the ``Confirmed`` and ``Deaths`` columns as two stacked line charts.

    :param data: frame containing ``Confirmed`` and ``Deaths`` columns
    :return: whatever ``DataFrame.plot`` returns for ``subplots=True``
    """
    columns = ['Confirmed', 'Deaths']
    selected = data[columns]
    # One subplot per column, arranged in two rows and one column.
    return selected.plot(
        kind='line',
        subplots=True,
        layout=(2, 1),
        figsize=(10, 10))
#%% Show fatalities
def plot_fatalities(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot ``Deaths`` as a percentage of ``Confirmed`` for every row of ``data``.

    NOTE(review): the chart title mentions "last two weeks", but no two-week
    window is applied here; the whole of ``data`` is plotted. Confirm which
    of the two is intended.

    :param data: frame containing ``Deaths`` and ``Confirmed`` columns
    :return: the object returned by ``Series.plot``
    """
    ratio = data['Deaths'].div(data['Confirmed'])
    percent = (ratio * 100).round(2)
    return percent.plot(
        kind='line',
        title='Percent of deaths vs new cases in last two weeks',
        xlabel='Day',
        ylabel='Percent',
        ylim=(0.0, 6.0),
        figsize=(10, 10),
        grid=True)
#%% Confirmed cases day-by-day difference
def plot_confirmed_daily(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot the row-to-row difference of the ``Confirmed`` column.

    :param data: frame containing a ``Confirmed`` column
    :return: the object returned by ``Series.plot``
    """
    day_over_day = data['Confirmed'].diff()
    return day_over_day.plot()
#%% Covid infection waves
def plot_confirmed_waves(data:pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot the weekly median of the row-to-row ``Confirmed`` difference.

    :param data: date-indexed frame containing a ``Confirmed`` column
    :return: the object returned by ``Series.plot``
    """
    day_over_day = data['Confirmed'].diff()
    weekly_median = day_over_day.resample('W').median()
    return weekly_median.plot()
#%% Confirmed cases in last two weeks
def plot_confirmed_last(data: pd.DataFrame, freq='2W') -> matplotlib.axes.Axes:
    """
    Plot the row-to-row ``Confirmed`` difference for the trailing ``freq`` period.

    ``Series.last()`` is deprecated since pandas 2.1, so the trailing window
    is selected explicitly using the same rule that method applied: keep the
    rows whose index is strictly greater than ``index[-1] - to_offset(freq)``.

    :param data: frame with a sorted ``DatetimeIndex`` and a ``Confirmed`` column
    :param freq: offset alias describing the length of the trailing window
    :return: the object returned by ``Series.plot``
    """
    from pandas.tseries.frequencies import to_offset

    confirmed_total = data['Confirmed']
    if len(confirmed_total.index) == 0:
        # Nothing to slice; mirror ``Series.last`` which returned the
        # (empty) series unchanged.
        recent = confirmed_total
    else:
        cutoff = confirmed_total.index[-1] - to_offset(freq)
        # side='right' skips rows equal to the cutoff, like ``Series.last``.
        start = confirmed_total.index.searchsorted(cutoff, side='right')
        recent = confirmed_total.iloc[start:]
    return recent.diff().plot()
#%% Confirmed cases every month
def plot_confirmed_monthly(data: pd.DataFrame) -> matplotlib.axes.Axes:
    """
    Plot the number of newly confirmed cases in each calendar month.

    ``Confirmed`` is a running total (the other plots in this file call
    ``.diff()`` to get daily figures), so summing it directly per month would
    add the same cases up many times. The daily differences are summed
    instead.

    The month-end bucket is passed as an offset object rather than the
    ``'M'`` alias, because newer pandas releases renamed that alias to
    ``'ME'``.

    :param data: date-indexed frame containing a ``Confirmed`` column
    :return: the object returned by ``Series.plot``
    """
    new_cases = data['Confirmed'].diff()
    monthly_cases = new_cases.resample(pd.offsets.MonthEnd()).sum()
    return monthly_cases.plot()
#%%
def plot_confirmed_after_holidays(
        data: pd.DataFrame,
        since: date | str | None = '2021-01-01',
        until: date | str | None = '2022-02-07',
        days: int = 14,
        calendar: AbstractHolidayCalendar = PLHolidayCalendar(),
) -> matplotlib.axes.Axes:
    """
    Plot the day-to-day ``Confirmed`` change in the days following each holiday.

    For every holiday that ``calendar`` reports between ``since`` and
    ``until``, the first ``days`` rows of ``data`` starting at the holiday
    are taken, re-indexed from 0 so the holidays line up, and differenced.
    Each holiday gets its own subplot, labelled ``YYYY-MM-DD``.

    The subplot grid is sized from the number of holidays found. A fixed
    15-row layout only matches the default date range and makes the plot
    call fail once more than 15 holidays fall in the range.

    :param data: date-indexed frame containing a ``Confirmed`` column
    :param since: start of the holiday search range
    :param until: end of the holiday search range
    :param days: number of rows taken from ``data`` for each holiday
    :param calendar: holiday calendar used to find the holidays
    :return: the object returned by ``DataFrame.plot`` for ``subplots=True``
    """
    confirmed_total = data['Confirmed']

    def _window(start) -> pd.Series:
        # Positional index so that day N after each holiday shares an x value.
        return (confirmed_total
                .loc[start:]
                .iloc[:days]
                .reset_index(drop=True))

    windows = {
        holiday.strftime('%Y-%m-%d'): _window(holiday)
        for holiday in calendar.holidays(since, until)}
    frame = pd.DataFrame(windows)
    # At least one row keeps the layout valid when no holiday is in range.
    rows = max(len(frame.columns), 1)
    return frame.diff().plot(
        kind='line',
        subplots=True,
        layout=(rows, 1),
        sharex=True,
        figsize=(5, 15),
        grid=True)
#%% Run
if __name__ == '__main__':
    # Per-country frames; only ``poland`` feeds the plots below.
    poland, usa, france, china = (
        covid19(name) for name in ('Poland', 'US', 'France', 'China'))
    world = covid19()  # no country filter

    # Date window of the Polish data shared by every plot.
    data = poland.loc['2020-01-01':'2022-02-01']

    # Uncomment a ``plt.show()`` to display the figure drawn by the call above it.
    plot_trendline(data)
    # plt.show()

    plot_fatalities(data)
    # plt.show()

    plot_confirmed_daily(data)
    # plt.show()

    plot_confirmed_waves(data)
    # plt.show()

    plot_confirmed_last(data)
    # plt.show()

    plot_confirmed_monthly(data)
    # plt.show()

    plot_confirmed_after_holidays(data)
    # plt.show()

# Unfinished ideas, parked in a bare string literal so they never execute.
"""TODO:
# Resample
poland['Confirmed'].shift(periods=1, freq='D').plot(kind='line')
# Z rozróżnianiem na kwartały (Q)
plot = poland['Confirmed'].resample('Q').plot(kind='line', legend=True)
plot[0].name = '2020-Q1'
plot[1].name = '2020-Q2'
plot[2].name = '2020-Q3'
plot[3].name = '2020-Q4'
plot[4].name = '2021-Q1'
plot[5].name = '2021-Q2'
plot[6].name = '2021-Q3'
plot[7].name = '2021-Q4'
plot[8].name = '2022-Q1'
plt.show()
# Makrotrendy
poland['Confirmed'].diff().rolling(window=14).median().plot()
plt.show()
poland['Confirmed'].diff().rolling(window=14).median().plot()
plt.show()
poland['Confirmed'].diff().resample('W').median().plot()
plt.show()
poland['Confirmed'].diff().resample('M').median().plot()
plt.show()
poland['Confirmed'].diff().resample('Q').median().plot()
plt.show()
"""
7.2.2. Case Study - 0x02
from datetime import date
import pandas as pd
from doctest import testmod as run_tests
from matplotlib import pyplot as plt
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, EasterMonday, Easter
from pandas.tseries.offsets import Day
# "Procent" is Polish for "percent"; used as a unit multiplier when
# computing the mortality percentage further down.
PROCENT = 1

#%%
# Upstream JHU CSSE URLs, kept for reference only:
# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# URLs that are actually loaded below.
CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'
DEATHS = 'https://python3.info/_static/covid19-deaths.csv'

# Read each CSV and let pandas pick the best-fitting dtypes, in the same
# order as the names on the left.
confirmed, recovered, deaths = (
    pd.read_csv(url).convert_dtypes()
    for url in (CONFIRMED, RECOVERED, DEATHS))
class PLHolidayCalendar(AbstractHolidayCalendar):
    """
    Holiday calendar for Poland expressed as pandas holiday rules.

    Reference: https://en.wikipedia.org/wiki/Public_holidays_in_Poland

    Movable feasts are anchored on 1 January and shifted with ``Easter()``
    (plus a fixed number of days), so they follow each year's Easter date.

    >>> PLHolidayCalendar().holidays(start='2000-01-01', end='2000-12-31')
    DatetimeIndex(['2000-01-01', '2000-01-06', '2000-04-23', '2000-04-24',
                   '2000-05-01', '2000-05-03', '2000-06-11', '2000-06-22',
                   '2000-08-15', '2000-11-01', '2000-11-11', '2000-12-25',
                   '2000-12-26'],
                  dtype='datetime64[ns]', freq=None)
    """
    rules = [
        # Fixed-date holidays at the start of the year.
        Holiday(name='New Years Day', month=1, day=1),
        Holiday(name='Epiphany', month=1, day=6),
        # Easter Sunday and the pandas-provided Easter Monday rule.
        Holiday(name='Easter', month=1, day=1, offset=[Easter()]),
        EasterMonday,
        Holiday(name='May Day', month=5, day=1),
        Holiday(name='Constitution Day', month=5, day=3),
        # 49 and 60 days after Easter Sunday respectively.
        Holiday(name='Pentecost Sunday', month=1, day=1,
                offset=[Easter(), Day(49)]),
        Holiday(name='Corpus Christi', month=1, day=1,
                offset=[Easter(), Day(60)]),
        Holiday(name='Assumption of the Blessed Virgin Mary', month=8, day=15),
        Holiday(name='All Saints Day', month=11, day=1),
        Holiday(name='Independence Day', month=11, day=11),
        Holiday(name='Christmas Day', month=12, day=25),
        Holiday(name='Second Day of Christmastide', month=12, day=26),
    ]
#%%
def _parse(data, country, name):
"""
>>> _parse(confirmed, 'Poland', name='confirmed').loc['2021-08-04']
confirmed 2883448
Name: 2021-08-04 00:00:00, dtype: int64
>>> _parse(confirmed, 'Poland', name='confirmed').loc['2021-08-05']
confirmed 2883624
Name: 2021-08-05 00:00:00, dtype: int64
>>> _parse(recovered, 'Poland', name='recovered').loc['2021-08-04']
recovered 2653981
Name: 2021-08-04 00:00:00, dtype: int64
>>> _parse(recovered, 'Poland', name='recovered').loc['2021-08-05']
recovered 0
Name: 2021-08-05 00:00:00, dtype: int64
>>> _parse(deaths, 'Poland', name='deaths').loc['2021-08-04']
deaths 75269
Name: 2021-08-04 00:00:00, dtype: int64
>>> _parse(deaths, 'Poland', name='deaths').loc['2021-08-05']
deaths 75275
Name: 2021-08-05 00:00:00, dtype: int64
"""
if country is not None:
query = data['Country/Region'] == country
data = data.loc[query]
return (
data
.transpose()
.iloc[4:]
.sum(axis='columns')
.astype('int')
.to_frame()
.rename(lambda x: name, axis='columns')
.rename(pd.to_datetime, axis='index'))
#%%
def get(country=None):
    """
    Combine confirmed, recovered and deaths into one date-indexed frame.

    Each module-level source frame is passed through ``_parse`` and the
    resulting single-column frames are placed side by side.

    :param country: ``Country/Region`` value to keep, or ``None`` for all rows
    :return: frame with columns ``confirmed``, ``recovered``, ``deaths``

    >>> get('Poland').loc['2021-08-04']
    confirmed    2883448
    recovered    2653981
    deaths         75269
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get('Poland').loc['2021-08-05']
    confirmed    2883624
    recovered          0
    deaths         75275
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> get('United Kingdom').loc['2021-08-04']
    confirmed    5980830
    recovered      24693
    deaths        157209
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get('United Kingdom').loc['2021-08-05']
    confirmed    6010860
    recovered          0
    deaths        157314
    Name: 2021-08-05 00:00:00, dtype: int64

    >>> get().loc['2021-08-04']
    confirmed    200758580
    recovered    130899061
    deaths         4283131
    Name: 2021-08-04 00:00:00, dtype: int64

    >>> get().loc['2021-08-05']
    confirmed    201444202
    recovered            0
    deaths         4294122
    Name: 2021-08-05 00:00:00, dtype: int64
    """
    sources = (
        ('confirmed', confirmed),
        ('recovered', recovered),
        ('deaths', deaths),
    )
    columns = [
        _parse(frame, country, name=label)
        for label, frame in sources]
    return pd.concat(columns, axis='columns')
#%%
# Per-country frames, built in the same order as the names on the left.
poland, germany, india, uk, france, china = (
    get(name)
    for name in ('Poland', 'Germany', 'India', 'United Kingdom', 'France', 'China'))
# No country filter: every row of the source data is included.
world = get()
#%%
def liczba_potwierdzonych_oraz_smierci_w_tygodniowych_okresach():
    """
    Return weekly sums of the world ``confirmed`` and ``deaths`` columns.

    The Polish name reads "number of confirmed cases and deaths in weekly
    periods".

    NOTE(review): the doctests in this file show ``confirmed`` growing from
    one day to the next, which suggests running totals; a weekly ``sum`` of
    running totals is not a weekly case count. Confirm the intent.
    """
    selected = world[['confirmed', 'deaths']]
    return selected.resample('W').sum()
def liczba_zachorowan_na_jeden_przypadek_smiertelny():
    """
    Return world ``confirmed`` divided by world ``deaths``, row by row.

    The Polish name reads "number of cases per one fatal case".
    """
    cases = world['confirmed']
    fatalities = world['deaths']
    return cases.div(fatalities)
def procent_smiertelnosci():
    """
    Return world ``deaths`` as a percentage of world ``confirmed``.

    The Polish name reads "mortality percentage". The ratio is scaled by 100
    and then by the module-level ``PROCENT`` multiplier.
    """
    ratio = world['deaths'].div(world['confirmed'])
    return ratio.mul(100).mul(PROCENT)
def get_holidays(year: int, calendar: AbstractHolidayCalendar) -> pd.DatetimeIndex:
    """
    Return the holidays that ``calendar`` reports for one calendar year.

    :param year: year to query, from 1 January to 31 December
    :param calendar: holiday calendar to ask
    :return: the dates returned by ``calendar.holidays``
    """
    first_day = date(year, 1, 1)
    last_day = date(year, 12, 31)
    return calendar.holidays(start=first_day, end=last_day)
def liczba_zachorowan_po_swietach(year, calendar=PLHolidayCalendar()):
    """
    Tabulate the daily ``confirmed`` increase in Poland after each holiday.

    The Polish name reads "number of cases after holidays". Only holidays of
    ``year`` that lie before the current moment are used. Each gets one
    column labelled ``YYYY-MM-DD``; rows are positions after the holiday.

    :param year: calendar year whose holidays are inspected
    :param calendar: holiday calendar used to find the holidays
    :return: frame with one column per past holiday

    >>> data = liczba_zachorowan_po_swietach(year=2022)
    >>> plot = data.plot(
    ...     kind='line',
    ...     subplots=True,
    ...     sharey=True,
    ...     sharex=True,
    ...     grid=True,
    ...     figsize=(10,20))
    >>> # plt.show()
    """
    now = pd.Timestamp('today')
    year_holidays = get_holidays(year, calendar)
    past_holidays = year_holidays[year_holidays < now]

    def daily_increase_after(holiday, days=10):
        window = poland.loc[holiday:, 'confirmed'].iloc[:days]
        increase = window.diff().reset_index(drop=True)
        # The first difference is always missing, so it is dropped before
        # casting to integers.
        return increase.iloc[1:].astype('int')

    columns = {}
    for holiday in past_holidays:
        columns[format(holiday, '%Y-%m-%d')] = daily_increase_after(holiday)
    return pd.DataFrame(columns)
7.2.3. Case Study - 0x03
from doctest import testmod as run_tests
import pandas as pd
from matplotlib import pyplot as plt
# Attribute-style equivalents of ``pd.set_option`` for console display.
pd.options.display.width = 200
pd.options.display.max_columns = 8
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
pd.options.display.max_seq_items = 100

#%%
# Upstream JHU CSSE URLs, kept for reference only:
# CONFIRMED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# RECOVERED = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
# DEATHS = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# URLs that are actually loaded below.
CONFIRMED = 'https://python3.info/_static/covid19-confirmed.csv'
RECOVERED = 'https://python3.info/_static/covid19-recovered.csv'
DEATHS = 'https://python3.info/_static/covid19-deaths.csv'

# Shorter column names, so the country column can be used as a plain
# identifier in later expressions.
COLUMNS = {
    'Province/State': 'region',
    'Country/Region': 'country',
}

# One CSV read per URL, in the same order as the names on the left.
confirmed, deaths, recovered = (
    pd.read_csv(url).rename(columns=COLUMNS)
    for url in (CONFIRMED, DEATHS, RECOVERED))
#%%
def _get(df: pd.DataFrame, country: str, name: str) -> pd.Series:
"""
>>> _get(confirmed, 'Poland', 'confirmed').loc['2021-01-01']
1305774
>>> _get(deaths, 'Poland', 'deaths').loc['2021-01-01']
28956
>>> _get(recovered, 'Poland', 'recovered').loc['2021-01-01']
1046281
"""
if country is not None:
df = df.query('country == @country')
return (df
.transpose()
.iloc[4:]
.sum(axis='columns')
.rename(name)
.rename(index=pd.to_datetime)
.astype('int64')
.convert_dtypes())
def covid19(country: str = None) -> pd.DataFrame:
    """
    Combine confirmed, deaths and recovered into one date-indexed frame.

    Each module-level source frame is passed through ``_get`` and the
    resulting series are placed side by side.

    :param country: ``country`` value to keep, or ``None`` for all rows
    :return: frame with columns ``confirmed``, ``deaths``, ``recovered``

    >>> covid19('Poland').loc['2021-01-01']
    confirmed    1305774
    deaths         28956
    recovered    1046281
    Name: 2021-01-01 00:00:00, dtype: Int64

    >>> covid19('US').loc['2021-01-01']
    confirmed    20397400
    deaths         352804
    recovered           0
    Name: 2021-01-01 00:00:00, dtype: Int64

    >>> covid19('China').loc['2021-01-01']
    confirmed    102649
    deaths         4884
    recovered     90031
    Name: 2021-01-01 00:00:00, dtype: Int64
    """
    sources = (
        ('confirmed', confirmed),
        ('deaths', deaths),
        ('recovered', recovered),
    )
    series = [
        _get(frame, country, name=label)
        for label, frame in sources]
    return pd.concat(series, axis='columns')
# ``run_tests`` is ``doctest.testmod``: execute the doctests defined above.
run_tests()

#%%
# Per-country frames, built in the same order as the names on the left.
pl, us, india, china, france = (
    covid19(name)
    for name in ('Poland', 'US', 'India', 'China', 'France'))
# No country filter: every row of the source data is included.
world = covid19()
#%%
# Line chart of the ``confirmed`` column for Poland, exactly as stored.
data = pl['confirmed']
plot_confirmed_total = data.plot(
kind='line',
label='Confirmed',
title='Total confirmed cases in Poland',
xlabel='Date',
ylabel='Total confirmed cases',)
# Adjust padding of the current pyplot figure so labels are not clipped.
plt.tight_layout()
# plt.show()
#%%
# Same column, differenced row to row to obtain daily changes.
# NOTE(review): no new figure is created between the two plots; with
# ``plt.show()`` commented out, both lines presumably land on the same
# axes. Confirm this is intended.
data = pl['confirmed'].diff()
plot_confirmed_daily = data.plot(
kind='line',
label='Confirmed',
title='Daily confirmed cases in Poland',
xlabel='Date',
ylabel='Daily confirmed cases',)
plt.tight_layout()
# plt.show()
# %%
def mortality(df: pd.DataFrame, since='2020-04-01', until=None) -> pd.Series:
    """
    Return ``deaths`` as a percentage of ``confirmed`` within a date range.

    :param df: date-indexed frame with ``deaths`` and ``confirmed`` columns
    :param since: first index label to include, or ``None`` for no lower bound
    :param until: last index label to include, or ``None`` for no upper bound
    :return: percentages for the selected rows, with missing values dropped
    """
    ratio = df['deaths'].div(df['confirmed'])
    selected = ratio.loc[since:until]
    return (selected * 100).dropna()
# Mortality percentage for Poland, using the default date range of ``mortality``.
data = mortality(pl)
plot_mortality = data.plot(
kind='line',
title='Mortality in Poland',
ylabel='mortality [%]',
label='Mortality',
xlabel='date')
# Horizontal red line at the mean of the plotted values, spanning the full
# date range of ``data``.
plt.hlines(data.mean(), xmin=data.index.min(), xmax=data.index.max(), color='red', label='Mean')
# The legend picks up the ``label`` values given to the line and to ``hlines``.
plt.legend()
plt.tight_layout()
# plt.show()