titanix data analysis edureka

titanic data analysis edureka
In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

titanic_data= pd.read_csv(r"C:\Users\huzaifa\Downloads\titanicdataset-traincsv\train.csv")
titanic_data.head(10)
Out[6]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
In [10]:
print("# of passengers in original data: " +str(len(titanic_data.index)))
# of passengers in original data: 891
In [11]:
# Analyzing data 550 did not survived 300 survived
sns.countplot(x="Survived", data=titanic_data)
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640abdad08>
In [14]:
sns.countplot(x="Survived", hue="Sex", data=titanic_data)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640c4d0508>
In [15]:
sns.countplot(x="Survived", hue="Pclass", data=titanic_data)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640c551408>
In [16]:
titanic_data['Age'].plot.hist()
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640c5c8d08>
In [20]:
titanic_data['Fare'].plot.hist(bins=20, figsize=(10,5))
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640c6ce448>
In [21]:
titanic_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [22]:
sns.countplot(x="SibSp", data=titanic_data)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640c783208>
In [23]:
#data wrangling 
titanic_data.isnull()
Out[23]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 False False False False False False False False False False True False
1 False False False False False False False False False False False False
2 False False False False False False False False False False True False
3 False False False False False False False False False False False False
4 False False False False False False False False False False True False
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False True False
887 False False False False False False False False False False False False
888 False False False False False True False False False False True False
889 False False False False False False False False False False False False
890 False False False False False False False False False False True False
891 rows × 12 columns
In [24]:
titanic_data.isnull().sum()
Out[24]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
In [28]:
sns.heatmap(titanic_data.isnull(), yticklabels=False, cmap="viridis")
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640ca0be88>
In [31]:
#drop age columns
sns.boxplot(x="Pclass",y="Age", data=titanic_data)
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640c9eb308>
In [32]:
titanic_data.head(5)
Out[32]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [34]:
titanic_data.drop("Cabin", axis=1, inplace=True)
In [35]:
titanic_data.head(5)
Out[35]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
In [36]:
titanic_data.dropna(inplace=True)
In [37]:
sns.heatmap(titanic_data.isnull(), yticklabels=False, cbar=False)
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x2640fce2bc8>
In [38]:
titanic_data.isnull().sum()
Out[38]:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
In [39]:
titanic_data.head(2)
Out[39]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
In [45]:
sex=pd.get_dummies(titanic_data['Sex'],drop_first=True)
sex.head(5)
Out[45]:
male
0 1
1 0
2 0
3 0
4 1
In [49]:
embark = pd.get_dummies(titanic_data['Embarked'],drop_first=True)
embark.head(5)
Out[49]:
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
In [55]:
pcl = pd.get_dummies(titanic_data['Pclass'],drop_first=True)
pcl.head(5)
Out[55]:
2 3
0 0 1
1 0 0
2 0 1
3 0 0
4 0 1
In [56]:
#concatenate
titanic_data=pd.concat([titanic_data,sex,embark,pcl],axis=1)
In [57]:
titanic_data.head(5)
Out[57]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked male Q S 2 3
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 1 0 1 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 0 0 0 0 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 0 0 1 0 1
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 0 0 1 0 0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1 0 1 0 1
In [63]:
titanic_data.drop(['Sex','Embarked','PassengerId','Name','Ticket','Pclass'],axis=1,inplace=True)
In [64]:
titanic_data.head(5)
Out[64]:
Survived Age SibSp Parch Fare male Q S 2 3
0 0 22.0 1 0 7.2500 1 0 1 0 1
1 1 38.0 1 0 71.2833 0 0 0 0 0
2 1 26.0 0 0 7.9250 0 0 1 0 1
3 1 35.0 1 0 53.1000 0 0 1 0 0
4 0 35.0 0 0 8.0500 1 0 1 0 1
In [65]:
#training dataset
x=titanic_data.drop("Survived", axis=1)
y=titanic_data["Survived"]
In [68]:
from sklearn.model_selection import train_test_split
In [71]:
 X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [79]:
from sklearn.linear_model import LogisticRegression
In [80]:
logmodel=LogisticRegression()
In [82]:
logmodel.fit(X_train,y_train)
C:\Users\huzaifa\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[82]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [83]:
prediction = logmodel.predict(X_test)
In [84]:
#accuracy
from sklearn.metrics import classification_report
In [87]:
classification_report(y_test,prediction)
Out[87]:
'              precision    recall  f1-score   support\n\n           0       0.81      0.83      0.82       126\n           1       0.75      0.72      0.73        88\n\n    accuracy                           0.79       214\n   macro avg       0.78      0.77      0.78       214\nweighted avg       0.78      0.79      0.78       214\n'
In [88]:
from sklearn.metrics import confusion_matrix
In [89]:
confusion_matrix(y_test,prediction)
Out[89]:
array([[105,  21],
       [ 25,  63]], dtype=int64)
In [90]:
from sklearn.metrics import accuracy_score
In [92]:
accuracy_score(y_test,prediction)*100
Out[92]:
78.50467289719626
In [ ]:
>

Comments

Popular posts from this blog

Interview Preparation Kit

Dinosaurus_Island_Character_level_language_model

How to crack the interviews and get a decent job?