Basic Pandas Project

Reading a CSV file and performing basic operations on pandas DataFrames.

import numpy as np
import pandas as pd
# Load the diabetes dataset; 'diabetes.csv' is read relative to the current
# working directory.
df = pd.read_csv('diabetes.csv')
# Preview the first rows. NOTE: a bare expression is only displayed in an
# interactive session (notebook/REPL); when run as a script the result is
# discarded.
df.head()

# Column-wise mean. numeric_only=True restricts the statistic to numeric
# columns, which matches the printed label and keeps the call from failing
# if the CSV ever contains a non-numeric column.
mean_values = df.mean(numeric_only=True)
print("Mean values of each numerical column:\n", mean_values)

# Column-wise maximum, computed over every column as the label states.
max_values = df.max()
print("Maximum value in each column:\n", max_values)

# Column-wise median of the numeric columns.
median_values = df.median(numeric_only=True)
print("Median values of each numerical column:\n", median_values)

# Column-wise sample standard deviation of the numeric columns.
std_dev_values = df.std(numeric_only=True)
print("Standard deviation of each numerical column:\n", std_dev_values)

# Count of missing (NaN/None) entries per column.
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)
# Sort the rows by the 'Age' column, oldest first. sort_values returns a new
# DataFrame; df itself is left untouched.
sorted_df = df.sort_values(by='Age', ascending=False)
print("DataFrame sorted by 'Age' column in descending order:\n", sorted_df)

# Boolean-mask selection: keep only rows whose 'Glucose' exceeds 100.
glucose_gt_100 = df[df['Glucose'] > 100]
print("Rows where 'Glucose' is greater than 100:\n", glucose_gt_100)

# Combine two conditions with '&'; each comparison needs its own parentheses
# because '&' binds tighter than '>'.
subset_df = df[(df['Glucose'] > 100) & (df['BMI'] > 30)]
print("Subset of DataFrame where 'Glucose' is greater than 100 and 'BMI' is greater than 30:\n", subset_df)

# Rows where 'Age' exceeds 40. Named age_over_40 rather than `filter` so the
# built-in filter() function is not shadowed.
age_over_40 = df[df['Age'] > 40]
print("Filtered DataFrame where 'Age' is greater than 40:\n", age_over_40)
# Group the rows by the 'Outcome' column and compute the mean of the numeric
# columns within each group. The grouping key now matches the printed label.
# NOTE(review): assumes the CSV has an 'Outcome' column (as the message below
# implies) — confirm against the dataset.
group = df.groupby('Outcome').mean(numeric_only=True)
print("Mean values of numerical columns grouped by 'Outcome':\n", group)

[Data is collected from Kaggle.]

  • head: Displays the first few rows of a DataFrame in pandas.

  • info: Provides a summary of the DataFrame, including the data type and non-null count of each column.

  • tail: Displays the last few rows of a DataFrame in pandas.

  • shape: Returns a tuple (rows, columns) representing the dimensions of the DataFrame.

  • describe: Generates descriptive statistics of the numerical columns in a DataFrame.