# Student Activity
# Import necessary libraries
import pandas as pd
import numpy as np
# ===============================
# 1. Create a Sample DataFrame
# ===============================
# Small demonstration dataset. Each column contains exactly one missing value
# (np.nan) so that the cleaning steps below have something to act on.
data = {
    'Age': [25, 30, 22, 40, np.nan, 35, 28, 45, 29, 31],
    'Salary': [50000, 60000, 45000, 80000, 70000, np.nan, 52000, 85000, 48000, 62000],
    'Department': ['HR', 'IT', 'IT', 'Finance', 'Finance', 'HR', 'IT', 'Finance', 'HR', np.nan],
}
df = pd.DataFrame(data)
print("Initial DataFrame:")
print(df)

# ======================================================
# 2. Descriptive Statistics and Summary Functions
# ======================================================
# describe() summarises the numeric columns ('Age' and 'Salary'); missing
# values are excluded from the statistics.
print("\nDescriptive Statistics for Numerical Features:")
print(df.describe())

# Summary for the categorical feature. dropna=False keeps the NaN entry in the
# counts so the missing value stays visible.
print("\nSummary for Categorical Feature 'Department':")
print(df['Department'].value_counts(dropna=False))

# ============================================
# 3. Data Cleaning: Handling Missing Values
# ============================================
# Option 1: Remove every row that contains at least one missing value.
# dropna() returns a new DataFrame; df itself is left untouched.
df_dropped = df.dropna()
print("\nDataFrame after dropping rows with missing values:")
print(df_dropped)

# Option 2: Fill missing values instead of discarding rows.
# Work on a copy so the original DataFrame stays available for comparison.
df_filled = df.copy()

# Numeric columns: fill with the column median (computed over the non-missing
# values).
for column in ('Age', 'Salary'):
    df_filled[column] = df_filled[column].fillna(df_filled[column].median())

# Categorical column: fill with the most frequent value (mode). mode() returns
# every tied value; in this dataset all three departments occur three times,
# so [0] picks the first of the tied values ('Finance').
df_filled['Department'] = df_filled['Department'].fillna(df_filled['Department'].mode()[0])
print("\nDataFrame after filling missing values:")
print(df_filled)

# =====================================================
# 4. Data Transformation: Normalizing the 'Salary'
# =====================================================
# Min-max scaling: (x - min) / (max - min) maps the smallest salary to 0 and
# the largest to 1.
# NOTE: if every salary were identical the range would be 0 and the division
# would produce NaN; that cannot happen with the sample data above.
salary_min = df_filled['Salary'].min()
salary_range = df_filled['Salary'].max() - salary_min
df_filled['Salary_normalized'] = (df_filled['Salary'] - salary_min) / salary_range
print("\nDataFrame after normalizing the 'Salary' column:")
print(df_filled)

# ========================================================
# 5. Using Pandas Profiling for Exploratory Data Analysis
# ========================================================
# Note: the pandas-profiling package has been renamed to ydata-profiling and
# the old name is deprecated. To run this section, install it using:
# pip install ydata-profiling
# The code below is disabled by wrapping it in a string literal; remove the
# surrounding triple quotes once ydata-profiling is installed.
"""
from ydata_profiling import ProfileReport
# Generate a detailed report for the cleaned DataFrame.
profile = ProfileReport(df_filled, title="Pandas Profiling Report", explorative=True)
profile.to_file("pandas_profiling_report.html")
print("\nPandas Profiling Report has been generated and saved as 'pandas_profiling_report.html'")
"""

# =======================================================
# 6. Using Sweetviz for Exploratory Data Analysis
# =======================================================
# Note: To run this section, install sweetviz using:
# pip install sweetviz
# The code below is disabled by wrapping it in a string literal; remove the
# surrounding triple quotes once sweetviz is installed.
"""
import sweetviz as sv
# Generate an interactive EDA report.
report = sv.analyze(df_filled)
report.show_html("sweetviz_report.html")
print("\nSweetviz report has been generated and saved as 'sweetviz_report.html'")
"""
# ============================
# End of EDA Demonstration
# ============================
# Guide & Explanation
# Last updated