import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession
import seaborn as sns

# Create a SparkSession
spark = SparkSession.builder \
    .appName("TitanicAnalysis") \
    .getOrCreate()

# Load the Titanic dataset
titanic_df = spark.createDataFrame(sns.load_dataset("titanic"))

titanic_df = titanic_df.fillna({'Age': 0})

# Filter data for male passengers who died and remove null values from Age column
male_deceased = titanic_df.filter((titanic_df["Sex"] == "male") & (titanic_df["Survived"] == 0) & titanic_df["Age"].isNotNull())

# Check if there are any male passengers who died
male_deceased_count = male_deceased.count()

if male_deceased_count > 0:
    # Calculate the average age of male passengers who died
# Calculate the average age of male passengers who died
    male_deceased_age_avg = male_deceased.agg({"Age": "avg"}).collect()[0][0]
    print("Number of male passengers who died:", male_deceased_count)
    print("Average age of male passengers who died:", male_deceased_age_avg)
else:
    print("No male passengers found who died in the dataset.")
    
female_deceased_by_class = titanic_df.filter((titanic_df["Sex"] == "female") & (titanic_df["Survived"] == 0)).groupBy("Pclass").count()

# Display results
print("Number of deceased passengers in each class among females:")
female_deceased_by_class.show()
# Stop SparkSession
spark.stop()
A10

Leave a Comment Cancel Reply