Titanic data analysis (Edureka)
In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
# Location of the Titanic train.csv file.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
train_csv_path = r"C:\Users\huzaifa\Downloads\titanicdataset-traincsv\train.csv"
titanic_data = pd.read_csv(train_csv_path)
# Preview the first ten rows of the loaded frame.
titanic_data.head(n=10)
Out[6]:
In [10]:
# Report how many rows the frame currently holds.
passenger_count = titanic_data.shape[0]
print("# of passengers in original data: " + str(passenger_count))
In [11]:
# Analysing the data: by the author's reading of this plot, roughly 550
# passengers did not survive and roughly 300 survived.
sns.countplot(data=titanic_data, x="Survived")
Out[11]:
In [14]:
# Survival counts, with one bar per value of the Sex column.
sns.countplot(data=titanic_data, x="Survived", hue="Sex")
Out[14]:
In [15]:
# Survival counts, with one bar per value of the Pclass column.
sns.countplot(data=titanic_data, x="Survived", hue="Pclass")
Out[15]:
In [16]:
# Histogram of the Age column using the pandas plotting defaults.
titanic_data["Age"].plot(kind="hist")
Out[16]:
In [20]:
# Histogram of the Fare column: 20 bins on a 10x5 figure.
titanic_data["Fare"].plot(kind="hist", bins=20, figsize=(10, 5))
Out[20]:
In [21]:
# Print the per-column dtype and non-null count summary of the frame.
titanic_data.info()
In [22]:
# Number of passengers for each value of the SibSp column.
sns.countplot(data=titanic_data, x="SibSp")
Out[22]:
In [23]:
# Data wrangling: boolean frame that is True wherever a cell is missing.
titanic_data.isna()
Out[23]:
In [24]:
# Number of missing cells in each column.
titanic_data.isna().sum()
Out[24]:
In [28]:
# Heatmap of the missing-value mask; row tick labels are suppressed.
missing_mask = titanic_data.isna()
sns.heatmap(missing_mask, cmap="viridis", yticklabels=False)
Out[28]:
In [31]:
# Box plot of Age for each Pclass value (nothing is dropped in this cell).
sns.boxplot(x="Pclass",y="Age", data=titanic_data)
Out[31]:
In [32]:
# Show the first five rows (head's default row count).
titanic_data.head()
Out[32]:
In [34]:
# Remove the Cabin column from the frame in place.
# NOTE(review): because this mutates titanic_data, re-running the cell raises
# KeyError once Cabin is gone.
titanic_data.drop(columns="Cabin", inplace=True)
In [35]:
# Re-inspect the first five rows now that Cabin has been removed.
titanic_data.head()
Out[35]:
In [36]:
# Discard, in place, every row that still contains at least one missing value.
titanic_data.dropna(axis=0, how="any", inplace=True)
In [37]:
# Redraw the missing-value heatmap after cleaning (no colour bar, no row labels).
remaining_missing = titanic_data.isna()
sns.heatmap(remaining_missing, cbar=False, yticklabels=False)
Out[37]:
In [38]:
# Per-column missing-value counts after the rows with gaps were dropped.
titanic_data.isna().sum()
Out[38]:
In [39]:
# Peek at the first two rows of the cleaned frame.
titanic_data.head(n=2)
Out[39]:
In [45]:
# Indicator-encode the Sex column; drop_first=True omits the first category so
# k categories become k-1 indicator columns.
sex = pd.get_dummies(data=titanic_data["Sex"], drop_first=True)
sex.head()
Out[45]:
In [49]:
# Indicator-encode the Embarked column, omitting the first category.
embark = pd.get_dummies(data=titanic_data["Embarked"], drop_first=True)
embark.head()
Out[49]:
In [55]:
# Indicator-encode the Pclass column, omitting the first category.
# The prefix keeps the new column labels as strings ("Pclass_<value>"). If
# Pclass holds numeric codes, unprefixed dummies would get numeric labels, and
# once they are concatenated with the string-labelled columns scikit-learn
# rejects the mixed-type feature names when the model is fitted.
pcl = pd.get_dummies(titanic_data['Pclass'], prefix='Pclass', drop_first=True)
pcl.head(5)
Out[55]:
In [56]:
# Append the three indicator frames to the main frame as extra columns.
frames_to_join = [titanic_data, sex, embark, pcl]
titanic_data = pd.concat(frames_to_join, axis="columns")
In [57]:
# Check the first five rows of the widened frame.
titanic_data.head()
Out[57]:
In [63]:
# Drop, in place, the source columns of the indicator encodings together with
# the PassengerId, Name and Ticket columns.
columns_to_discard = ['Sex', 'Embarked', 'PassengerId', 'Name', 'Ticket', 'Pclass']
titanic_data.drop(columns=columns_to_discard, inplace=True)
In [64]:
# First five rows of the frame that will feed the model.
titanic_data.head()
Out[64]:
In [65]:
# Training dataset: the target is the Survived column, the features are
# every other column.
y = titanic_data["Survived"]
x = titanic_data.drop(columns="Survived")
In [68]:
from sklearn.model_selection import train_test_split
In [71]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.3,
)
In [79]:
from sklearn.linear_model import LogisticRegression
In [80]:
# Logistic-regression classifier. The iteration cap is raised above the default
# of 100 because the features are not scaled here, and the default solver can
# otherwise stop before converging (it then emits a ConvergenceWarning).
logmodel = LogisticRegression(max_iter=1000)
In [82]:
# Fit the classifier on the training split.
logmodel.fit(X=X_train, y=y_train)
Out[82]:
In [83]:
# Predicted labels for the held-out test rows.
prediction = logmodel.predict(X=X_test)
In [84]:
#accuracy
from sklearn.metrics import classification_report
In [87]:
# classification_report returns one multi-line string. Print it so the table
# renders with real line breaks instead of a quoted repr full of "\n".
print(classification_report(y_test, prediction))
Out[87]:
In [88]:
from sklearn.metrics import confusion_matrix
In [89]:
# Confusion matrix of true test labels against predicted labels.
confusion_matrix(y_true=y_test, y_pred=prediction)
Out[89]:
In [90]:
from sklearn.metrics import accuracy_score
In [92]:
# Test-set accuracy expressed as a percentage.
100 * accuracy_score(y_true=y_test, y_pred=prediction)
Out[92]:
In [ ]:
>
Comments
Post a Comment