The Europe CDC has the cleanest data that I have seen so far.
The New York Times has github data repository for USA cases.
New York Times COVID-19 Database¶
Data Description¶
Repository and Download¶
The NY Times released their COVID-19 database to github. The submissionREADME.md
warns because of limited testing the counts are under the real counts.
In [1]:
download_url="https://github.com/nytimes/covid-19-data/archive/master.zip"
In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import requests,zipfile,os,re
In [3]:
data_dir="Data/NYT_Covid_19_Databases/"
down_file=data_dir+"NTY_Covid_19.zip"
In [4]:
response=requests.get(download_url)
with open(down_file,'wb') as f: f.write(response.content)
with zipfile.ZipFile(down_file) as zipper: zipper.extractall(data_dir)
In [5]:
def dirTree(rootDir='.'):
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
print('\t%s' % fname)
In [6]:
dirTree(data_dir)
Description¶
Two csv files exist in the database.
In [7]:
counties_file="Data/NYT_Covid_19_Databases/covid-19-data-master/us-counties.csv"
states_file="Data/NYT_Covid_19_Databases/covid-19-data-master/us-states.csv"
Counties Description¶
In [8]:
counties=pd.read_csv(counties_file)
counties['LogCases']=np.log10(counties.cases+1)
counties['LogDeaths']=np.log10(counties.deaths+1)
In [9]:
counties[:3]
Out[9]:
In [10]:
counties.describe()
Out[10]:
In [11]:
sum((counties.county=='Collin')&(counties.state=='Texas'))
Out[11]:
In [12]:
collin=counties[counties.county=='Collin'].copy()
In [13]:
dallas=counties[(counties.county=='Dallas')&(counties.state=='Texas')].copy()
In [14]:
westchester=counties[(counties.county=='Westchester')&
(counties.state=='New York')&
(counties.date>='2020-03-10')
].copy()
In [15]:
p=collin.plot(x="date",y="LogCases",label="Collin")
dallas.plot(x='date',y='LogCases',label="Dallas",ax=p)
westchester.plot(x='date',y='LogCases',label="Westchester",ax=p);
In [16]:
p=collin.plot(x="date",y="cases",label="Collin")
dallas.plot(x='date',y='cases',label="Dallas",ax=p)
westchester.plot(x='date',y='cases',label="Westchester",ax=p);
States Descriptions¶
In [17]:
states=pd.read_csv(states_file)
states.date=pd.to_datetime(states.date)
states['LogCases']=np.log10(states.cases+1)
states['LogDeaths']=np.log10(states.deaths+1)
In [18]:
states.columns
Out[18]:
In [35]:
texas=states[states.state=='Texas'].copy()
newyork=states[states.state=='New York'].copy()
california=states[states.state=='California'].copy()
florida=states[states.state=='Florida'].copy()
louisiana=states[states.state=='Louisiana'].copy()
virginia=states[states.state=='Virginia'].copy()
georgia=states[states.state=='Georgia'].copy()
In [20]:
p=texas.plot(x="date",y="LogCases",label="Texas")
newyork.plot(x='date',y='LogCases',label="New York",ax=p)
california.plot(x='date',y='LogCases',label="California",ax=p)
florida.plot(x='date',y='LogCases',label="Florida",ax=p)
louisiana.plot(x='date',y='LogCases',label="Louisiana",ax=p)
virginia.plot(x='date',y='LogCases',label="Virginia",ax=p)
georgia.plot(x='date',y='LogCases',label="Georgia",ax=p)
Out[20]:
In [21]:
p=texas.plot(x="date",y="cases",label="Texas")
newyork.plot(x='date',y='cases',label="New York",ax=p)
california.plot(x='date',y='cases',label="California",ax=p)
florida.plot(x='date',y='cases',label="Florida",ax=p)
louisiana.plot(x='date',y='cases',label="Louisiana",ax=p)
virginia.plot(x='date',y='cases',label="Virginia",ax=p)
georgia.plot(x='date',y='cases',label="Georgia",ax=p)
Out[21]:
Saving Data¶
In [22]:
import MySQLdb,sqlalchemy
import datetime as dt
In [23]:
Engine=sqlalchemy.create_engine('mysql://cov19:LionsTigersAndBears@localhost/CoV19')
In [24]:
stamp_time=dt.datetime.now().strftime("%Y%m%d%H%M%S")
In [25]:
states.to_sql("NYT_States_"+stamp_time,con=Engine)
In [26]:
counties.to_sql("NYT_Counties_"+stamp_time,con=Engine)
In [27]:
r=Engine.execute('select * from NYT_States_200403085047 limit 10')
Plotting¶
Here we take a look at using Bokeh.
In [28]:
from bokeh.plotting import figure, output_file,output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
In [29]:
output_notebook()
In [30]:
states.sample(5)
Out[30]:
In [36]:
texas['days_since_3_deaths']=texas.date-texas[texas.deaths>=3].date.min()
In [32]:
astates=[pd.DataFrame(y) for x,y in states.groupby('state',as_index=False)]
In [33]:
p=figure(x_axis_type='datetime',)
for df in astates:
source=ColumnDataSource(df)
p.line(x='date',y='LogDeaths',name=df.state.iloc[0],
line_alpha=0.2,hover_alpha=1.0,source=source,)
p.add_tools(HoverTool(tooltips=[('State','$name')]))
show(p)
In [34]:
p=figure(x_axis_type='datetime')
usa=states.groupby('date').sum().reset_index()
source=ColumnDataSource(usa)
p.line(x='date',y='deaths',line_alpha=1.0,source=source)
show(p)
In [ ]: