Data Analysis

profilekshah9286
EDA2covid-variants.pdf

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 1/16

location date variant num_sequences perc_sequences num_sequences_total

0 Angola 2020-07-06 Alpha 0 0.0 3

1 Angola 2020-07-06 B.1.1.277 0 0.0 3

2 Angola 2020-07-06 B.1.1.302 0 0.0 3

3 Angola 2020-07-06 B.1.1.519 0 0.0 3

4 Angola 2020-07-06 B.1.160 0 0.0 3

<class 'pandas.core.frame.DataFrame'> RangeIndex: 100416 entries, 0 to 100415 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 location 100416 non-null object 1 date 100416 non-null datetime64[ns] 2 variant 100416 non-null object 3 num_sequences 100416 non-null int64 4 perc_sequences 100416 non-null float64 5 num_sequences_total 100416 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(2), object(2) memory usage: 4.6+ MB

Alpha 4184 B.1.1.277 4184 others 4184 S:677P.Pelican 4184 S:677H.Robin1 4184 Omicron 4184 Mu 4184

In [1]: import numpy as np import pandas as pd

In [2]: import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline

In [3]: from sklearn.linear_model import LinearRegression

# In[4]:
# Load the variant table; the `date` column is converted to datetimes while reading.
df = pd.read_csv(
    'covid-variants.csv',
    parse_dates=["date"],
)
df.head()

Out[4]:

# In[5]:
# Schema check: column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

# In[6]:
# Number of rows recorded for each variant label.
df["variant"].value_counts()

Out[6]:

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 2/16

Lambda 4184 Kappa 4184 Iota 4184 Gamma 4184 Eta 4184 Epsilon 4184 Delta 4184 Beta 4184 B.1.620 4184 B.1.367 4184 B.1.258 4184 B.1.221 4184 B.1.177 4184 B.1.160 4184 B.1.1.519 4184 B.1.1.302 4184 non_who 4184 Name: variant, dtype: int64

0 84173 1 2753 2 1405 3 905 4 631 ... 1690 1 1719 1 2156 1 1184 1 862 1 Name: num_sequences, Length: 1563, dtype: int64

num_sequences perc_sequences num_sequences_total

count 100416.000000 100416.000000 100416.000000

mean 72.171676 6.154355 1509.582457

std 1669.262169 21.898989 8445.291772

min 0.000000 -0.010000 1.000000

25% 0.000000 0.000000 12.000000

50% 0.000000 0.000000 59.000000

75% 0.000000 0.000000 394.000000

max 142280.000000 100.000000 146170.000000

<AxesSubplot:xlabel='date', ylabel='num_sequences'>

# In[7]:
# How often each distinct num_sequences value occurs.
df["num_sequences"].value_counts()

Out[7]:

# In[8]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum perc_sequences of -0.01; a negative
# percentage looks like a data-quality issue in the source file — confirm before modelling.
df.describe()

Out[8]:

# In[9]:
# Scatter of sequence counts over time, one colour per variant.
plt.figure(figsize=(22, 8))
sns.scatterplot(
    data=df,
    x="date",
    y="num_sequences",
    hue="variant",
)

Out[9]:

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 3/16

array(['Angola', 'Argentina', 'Aruba', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Belize', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Cambodia', 'Cameroon', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Curacao', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kosovo', 'Kuwait', 'Latvia', 'Lebanon', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Malta', 'Mauritius', 'Mexico', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Nepal', 'Netherlands', 'New Zealand', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Senegal', 'Serbia', 'Seychelles', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Suriname', 'Sweden', 'Switzerland', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Vietnam', 'Zambia', 'Zimbabwe'], dtype=object)

array(['Alpha', 'B.1.1.277', 'B.1.1.302', 'B.1.1.519', 'B.1.160', 'B.1.177', 'B.1.221', 'B.1.258', 'B.1.367', 'B.1.620', 'Beta', 'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa', 'Lambda', 'Mu', 'Omicron', 'S:677H.Robin1', 'S:677P.Pelican', 'others', 'non_who'], dtype=object)

# In[10]:
# Distinct locations present in the data, in order of first appearance.
df["location"].unique()

Out[10]:

# In[11]:
# Distinct variant labels present in the data, in order of first appearance.
df["variant"].unique()

Out[11]:

# For every variant label, aggregate num_sequences per location and draw a horizontal
# bar chart of the result (one new 20x8 figure per variant).
# NOTE(review): this export cuts the cell off in three places (after `.ag`, after
# `'Number of Case':most_cases`, and after `loc='cen`), so the exact aggregation, any
# sorting/slicing, and the remaining title arguments cannot be read from here.
# NOTE(review): the visible part of the loop opens a figure with plt.figure(...) on each
# pass and never closes it; the cell's recorded output is matplotlib's RuntimeWarning about
# more than 20 open figures. Showing and closing each figure inside the loop would avoid it.
# NOTE(review): `virus` remains bound after the loop finishes; cells In [15], In [17] and
# In [18] read it, so they silently depend on this cell having been run first.
In [12]: for virus in df.variant.unique(): most_cases = df.loc[df['variant'] == virus].groupby('location')['num_sequences'].ag most_cases = pd.DataFrame({'Location':most_cases.index, 'Number of Case':most_cases plt.figure(figsize=(20,8)) sns.barplot(y='Location',x="Number of Case",data=most_cases,palette="plasma_r") plt.title('COUNTRIES HAVE MORE {} CASES THAN OTHERS'.format(virus).upper(),loc='cen

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 4/16

C:\Users\shahk\AppData\Local\Temp/ipykernel_4392/2010383229.py:4: RuntimeWarning: More t han 20 figures have been opened. Figures created through the pyplot interface (`matplotl ib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). plt.figure(figsize=(20,8))

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 5/16

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 6/16

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 7/16

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 8/16

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 9/16

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 10/16

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 11/16

location variant num_sequences perc_sequences num_sequences_total month year day

# In[13]:
# Split the timestamp into calendar parts used by the grouped charts further down.
# `date` is already datetime64 (parsed by read_csv), so the vectorised `.dt` accessor
# replaces the per-row Python lambda: same values, no Python-level loop over 100k rows.
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['day'] = df['date'].dt.day

# In[14]:
# The calendar parts were extracted in the previous cell, so the raw timestamp column is
# removed here. Rebinding `df` instead of using inplace=True, together with
# errors='ignore', keeps the cell re-runnable: executing it a second time no longer raises
# KeyError for the already-missing column.
df = df.drop(columns='date', errors='ignore')
df.head()

Out[14]:

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 12/16

location variant num_sequences perc_sequences num_sequences_total month year day

0 Angola Alpha 0 0.0 3 7 2020 6

1 Angola B.1.1.277 0 0.0 3 7 2020 6

2 Angola B.1.1.302 0 0.0 3 7 2020 6

3 Angola B.1.1.519 0 0.0 3 7 2020 6

4 Angola B.1.160 0 0.0 3 7 2020 6

# Sum num_sequences per calendar month for the rows whose variant equals `virus`, then
# repackage the result as a two-column frame ('Month', 'Number of Cases') for plotting.
# NOTE(review): `virus` is not assigned in this cell. The only visible binding is the loop
# variable of the In [12] `for virus in df.variant.unique()` loop, so on a top-to-bottom
# run it holds the last label of that array ('non_who' in the recorded Out[11]). The chart
# built from this frame is titled 'All Summed Variant', which does not match a
# single-variant filter — confirm the intended selection.
# NOTE(review): `month` is date.month only, so the same month of different years is pooled.
# NOTE(review): the first statement is cut off after `.so` in this export.
In [15]: df_val1 = df.loc[df["variant"]== virus].groupby('month')['num_sequences'].agg('sum').so df_val1 = pd.DataFrame({'Month':df_val1.index, 'Number of Cases':df_val1.values})

# In[16]:
# Bar chart of the per-month totals currently held in `df_val1`.
plt.figure(figsize=(14, 8))
sns.barplot(
    data=df_val1,
    x='Month',
    y='Number of Cases',
);
plt.title(
    'Monthly Cases Ratio Of All Summed Variant',
    fontweight="bold",
);

# Sum num_sequences per day-of-month for the rows whose variant equals `virus`, reshape to
# ('Day', 'Number of Cases') and draw it as a bar chart.
# NOTE(review): `virus` is not assigned in this cell; it is whatever the In [12] loop left
# behind ('non_who' is the last label in the recorded Out[11]). The title says
# 'All Summed Variant' although only one variant is selected — confirm the intent.
# NOTE(review): `day` is date.day (1-31), so days from every month and year are pooled.
# NOTE(review): the first statement is cut off after `.sort_` in this export.
In [17]: df_val1 = df.loc[df["variant"]== virus].groupby('day')['num_sequences'].agg('sum').sort_ df_val1 = pd.DataFrame({'Day':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Day', y='Number of Cases',data=df_val1); plt.title('Daily Cases Ratio Of All Summed Variant',fontweight="bold");

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 13/16

# Sum num_sequences per year for the rows whose variant equals `virus`, reshape to
# ('Year', 'Number of Cases') and draw it as a bar chart.
# NOTE(review): `virus` is not assigned in this cell; it is whatever the In [12] loop left
# behind ('non_who' is the last label in the recorded Out[11]). The title says
# 'Of All Variant' although only one variant is selected — confirm the intent.
# NOTE(review): the first statement is cut off after `.sor` in this export.
In [18]: df_val1 = df.loc[df["variant"]== virus].groupby('year')['num_sequences'].agg('sum').sor df_val1 = pd.DataFrame({'Year':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Year', y='Number of Cases',data=df_val1); plt.title('Over All Cases Ratio With Year Of All Variant',fontweight="bold");

# Sum Omicron num_sequences per calendar month, reshape to ('Month', 'Number of Cases')
# and draw it as a bar chart. The cell continues after the page header below.
# NOTE(review): the first statement is cut off after `.agg('sum'` in this export.
# NOTE(review): `month` is date.month only, so the same month of different years is pooled.
# NOTE(review): the title string spells 'Montly'; likely meant 'Monthly'.
# NOTE(review): In [19]-In [22] repeat one aggregate-and-plot pattern with only the
# grouping column and title changed; a small helper function would remove the duplication.
In [19]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('month')['num_sequences'].agg('sum' df_val1 = pd.DataFrame({'Month':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8))

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 14/16

sns.barplot(x='Month', y='Number of Cases',data=df_val1); plt.title('Omicron Cases Montly Ratio',fontweight="bold");

# Sum Omicron num_sequences per day-of-month, reshape to ('Day', 'Number of Cases') and
# draw it as a bar chart.
# NOTE(review): the first statement is cut off after `.agg('sum').` in this export.
# NOTE(review): `day` is date.day (1-31), so days from every month and year are pooled.
In [20]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('day')['num_sequences'].agg('sum'). df_val1 = pd.DataFrame({'Day':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Day', y='Number of Cases',data=df_val1); plt.title('Omicron Cases Daily Ratio',fontweight="bold");

# Sum Omicron num_sequences per year, reshape to ('Year', 'Number of Cases') and draw it
# as a bar chart. The cell continues after the page header below.
# NOTE(review): the first statement ends at the page edge right after `.agg('sum')`; it may
# have continued (the sibling cells chain further calls) — not recoverable from this export.
In [21]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('year')['num_sequences'].agg('sum')

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 15/16

df_val1 = pd.DataFrame({'Year':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(14,8)) sns.barplot(x='Year', y='Number of Cases',data=df_val1); plt.title('Omicron Cases Yearly Ratio',fontweight="bold");

# Aggregate Omicron num_sequences per location, reshape to ('Location', 'Number of Cases')
# and draw it as a bar chart with the x tick labels rotated by 30 degrees.
# NOTE(review): the first statement is cut off after `.agg('s` in this export, so the
# aggregation and any sorting or top-N selection implied by the 'Highest ...' title cannot
# be read from here.
In [22]: df_val1 = df.loc[df["variant"]== 'Omicron'].groupby('location')['num_sequences'].agg('s df_val1 = pd.DataFrame({'Location':df_val1.index, 'Number of Cases':df_val1.values}) plt.figure(figsize=(16,8)) sns.barplot(x='Location', y='Number of Cases',data=df_val1); plt.title('Highest Omicron Cases Location',fontweight="bold"); plt.xticks(rotation=30);

2/2/22, 7:33 PM EDA 2 covid-variants

localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 16/16

In [ ]: