Data Analysis
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 1/16
location date variant num_sequences perc_sequences num_sequences_total
0 Angola 2020-07-06 Alpha 0 0.0 3
1 Angola 2020-07-06 B.1.1.277 0 0.0 3
2 Angola 2020-07-06 B.1.1.302 0 0.0 3
3 Angola 2020-07-06 B.1.1.519 0 0.0 3
4 Angola 2020-07-06 B.1.160 0 0.0 3
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100416 entries, 0 to 100415 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 location 100416 non-null object 1 date 100416 non-null datetime64[ns] 2 variant 100416 non-null object 3 num_sequences 100416 non-null int64 4 perc_sequences 100416 non-null float64 5 num_sequences_total 100416 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(2), object(2) memory usage: 4.6+ MB
Alpha 4184 B.1.1.277 4184 others 4184 S:677P.Pelican 4184 S:677H.Robin1 4184 Omicron 4184 Mu 4184
In [1]: import numpy as np import pandas as pd
In [2]: import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline
In [3]: from sklearn.linear_model import LinearRegression
# In [4]:
# Read the variants CSV, parsing the "date" column as datetimes,
# then preview the first rows.
df = pd.read_csv(
    'covid-variants.csv',
    parse_dates=["date"],
)
df.head()
Out[4]:
# In [5]:
# Print the frame's columns, non-null counts, dtypes and memory usage.
df.info()
# In [6]:
# Number of rows recorded for each variant label.
df["variant"].value_counts()
Out[6]:
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 2/16
Lambda 4184 Kappa 4184 Iota 4184 Gamma 4184 Eta 4184 Epsilon 4184 Delta 4184 Beta 4184 B.1.620 4184 B.1.367 4184 B.1.258 4184 B.1.221 4184 B.1.177 4184 B.1.160 4184 B.1.1.519 4184 B.1.1.302 4184 non_who 4184 Name: variant, dtype: int64
0 84173 1 2753 2 1405 3 905 4 631 ... 1690 1 1719 1 2156 1 1184 1 862 1 Name: num_sequences, Length: 1563, dtype: int64
num_sequences perc_sequences num_sequences_total
count 100416.000000 100416.000000 100416.000000
mean 72.171676 6.154355 1509.582457
std 1669.262169 21.898989 8445.291772
min 0.000000 -0.010000 1.000000
25% 0.000000 0.000000 12.000000
50% 0.000000 0.000000 59.000000
75% 0.000000 0.000000 394.000000
max 142280.000000 100.000000 146170.000000
<AxesSubplot:xlabel='date', ylabel='num_sequences'>
# In [7]:
# How often each distinct num_sequences value occurs.
df["num_sequences"].value_counts()
Out[7]:
# In [8]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for df.
df.describe()
Out[8]:
# In [9]:
# Scatter of num_sequences over time, one colour per variant.
# The Axes returned by seaborn is left as the cell's last expression.
fig, ax = plt.subplots(figsize=(22, 8))
sns.scatterplot(
    data=df,
    x="date",
    y="num_sequences",
    hue="variant",
    ax=ax,
)
Out[9]:
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 3/16
array(['Angola', 'Argentina', 'Aruba', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Belize', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Cambodia', 'Cameroon', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Curacao', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kosovo', 'Kuwait', 'Latvia', 'Lebanon', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Malta', 'Mauritius', 'Mexico', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Nepal', 'Netherlands', 'New Zealand', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Senegal', 'Serbia', 'Seychelles', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Suriname', 'Sweden', 'Switzerland', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Vietnam', 'Zambia', 'Zimbabwe'], dtype=object)
array(['Alpha', 'B.1.1.277', 'B.1.1.302', 'B.1.1.519', 'B.1.160', 'B.1.177', 'B.1.221', 'B.1.258', 'B.1.367', 'B.1.620', 'Beta', 'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa', 'Lambda', 'Mu', 'Omicron', 'S:677H.Robin1', 'S:677P.Pelican', 'others', 'non_who'], dtype=object)
# In [10]:
# Distinct location names present in the data.
df["location"].unique()
Out[10]:
# In [11]:
# Distinct variant labels present in the data.
df["variant"].unique()
Out[11]:
In [12]: for virus in df.variant.unique(): most_cases = df.loc[df['variant'] == virus].groupby('location')['num_sequences'].ag most_cases = pd.DataFrame({'Location':most_cases.index, 'Number of Case':most_cases plt.figure(figsize=(20,8)) sns.barplot(y='Location',x="Number of Case",data=most_cases,palette="plasma_r") plt.title('COUNTRIES HAVE MORE {} CASES THAN OTHERS'.format(virus).upper(),loc='cen
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 4/16
C:\Users\shahk\AppData\Local\Temp/ipykernel_4392/2010383229.py:4: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). plt.figure(figsize=(20,8))
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 5/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 6/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 7/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 8/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 9/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 10/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 11/16
location variant num_sequences perc_sequences num_sequences_total month year day
# In [13]:
# Split the parsed timestamp into calendar parts used by the later
# group-by cells. The vectorized `.dt` accessor replaces the per-row
# Python lambda passed to `.apply`; the resulting values are the same.
dates = df['date']
df['month'] = dates.dt.month
df['year'] = dates.dt.year
df['day'] = dates.dt.day
# In [14]:
# Remove the raw 'date' column and preview the result.
# The original used drop(..., inplace=True), which raises KeyError when
# the cell is executed a second time because 'date' is already gone.
# Rebinding `df` with errors='ignore' keeps the cell safe to re-run.
df = df.drop(columns=['date'], errors='ignore')
df.head()
Out[14]:
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 12/16
location variant num_sequences perc_sequences num_sequences_total month year day
0 Angola Alpha 0 0.0 3 7 2020 6
1 Angola B.1.1.277 0 0.0 3 7 2020 6
2 Angola B.1.1.302 0 0.0 3 7 2020 6
3 Angola B.1.1.519 0 0.0 3 7 2020 6
4 Angola B.1.160 0 0.0 3 7 2020 6
In [15]: df_val1 = df.loc[df["variant"]== virus].groupby('month')['num_sequences'].agg('sum').so df_val1 = pd.DataFrame({'Month':df_val1.index, 'Number of Cases':df_val1.values})
# In [16]:
# Bar chart of the per-month totals held in df_val1.
# NOTE(review): the preceding cell builds df_val1 from rows where
# variant == virus, so the "All Summed Variant" title may not describe
# the plotted data — confirm.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=df_val1, x='Month', y='Number of Cases', ax=ax)
ax.set_title('Monthly Cases Ratio Of All Summed Variant', fontweight="bold");
In [17]: df_val1 = df.loc[df["variant"]== virus].groupby('day')['num_sequences'].agg('sum').sort_ df_val1 = pd.DataFrame({'Day':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Day', y='Number of Cases',data=df_val1); plt.title('Daily Cases Ratio Of All Summed Variant',fontweight="bold");
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 13/16
In [18]: df_val1 = df.loc[df["variant"]== virus].groupby('year')['num_sequences'].agg('sum').sor df_val1 = pd.DataFrame({'Year':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Year', y='Number of Cases',data=df_val1); plt.title('Over All Cases Ratio With Year Of All Variant',fontweight="bold");
In [19]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('month')['num_sequences'].agg('sum' df_val1 = pd.DataFrame({'Month':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8))
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 14/16
# (continuation of cell In [19]; the figure is created earlier in the cell)
# Bar chart of the Omicron per-month totals held in df_val1.
sns.barplot(x='Month', y='Number of Cases', data=df_val1);
# Title typo fixed: "Montly" -> "Monthly".
plt.title('Omicron Cases Monthly Ratio', fontweight="bold");
In [20]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('day')['num_sequences'].agg('sum'). df_val1 = pd.DataFrame({'Day':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Day', y='Number of Cases',data=df_val1); plt.title('Omicron Cases Daily Ratio',fontweight="bold");
In [21]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('year')['num_sequences'].agg('sum')
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 15/16
# (continuation of cell In [21])
# Turn the aggregated result into a two-column frame for plotting.
df_val1 = pd.DataFrame(
    {
        'Year': df_val1.index,
        'Number of Cases': df_val1.values,
    }
)

# Bar chart of Omicron totals per year.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=df_val1, x='Year', y='Number of Cases', ax=ax)
ax.set_title('Omicron Cases Yearly Ratio', fontweight="bold");
In [22]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('location')['num_sequences'].agg('s df_val1 = pd.DataFrame({'Location':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(16,8)) sns.barplot(x='Location', y='Number of Cases',data=df_val1); plt.title('Highest Omicron Cases Location',fontweight="bold"); plt.xticks(rotation=30);
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 16/16
In [ ]: