import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("StudentGrades") \
    .getOrCreate()

# Sample student scores
scores = [
    ("Alice", {"Math": 85, "Science": 90, "English": 80}),
    ("Bob", {"Math": 70, "Science": 75, "English": 85}),
    ("Charlie", {"Math": 60, "Science": 65, "English": 70}),
    ("David", {"Math": 90, "Science": 95, "English": 85}),
    ("Eve", {"Math": 75, "Science": 80, "English": 75})
]

# Create RDD from the scores
scores_rdd = spark.sparkContext.parallelize(scores)

# Define the grading scheme (example)
grading_scheme = {
    "A": (80, 100),
    "B": (60, 79),
    "C": (40, 59),
    "D": (0, 39)
}

# Function to compute grades for a given score
def compute_grade(score):
    for grade, (lower_bound, upper_bound) in grading_scheme.items():
        if lower_bound <= score <= upper_bound:
            return grade
    return "F"

# Map operation to compute grades for each student
grades_rdd = scores_rdd.map(lambda x: (x[0], {subject: compute_grade(score) for subject, score in x[1].items()}))

# Convert RDD to DataFrame
grades_df = spark.createDataFrame(grades_rdd.flatMap(lambda x: [(x[0], subject, grade) for subject, grade in x[1].items()]), ["Student", "Subject", "Grade"])

# Display the result
grades_df.show()

# Stop SparkSession
spark.stop()
A9

Leave a Comment Cancel Reply