import pandas as pd
def createDataframe(student_data: list[list[int]]) -> pd.DataFrame:
    """Build a student DataFrame from a 2D list of ``[student_id, age]`` rows.

    Args:
        student_data: One inner list per student, holding the student's id
            followed by the student's age.

    Returns:
        A DataFrame with the columns ``student_id`` and ``age``.
    """
    # Built-in generics (Python 3.9+) are used in the annotation so the
    # snippet does not depend on an un-imported ``typing.List``.
    return pd.DataFrame(student_data, columns=["student_id", "age"])
if __name__ == '__main__':
    # Sample input: one [student_id, age] pair per student.
    student_data = [
        [1, 15],
        [2, 11],
        [3, 11],
        [4, 20],
    ]
    # Assign the returned DataFrame to df and display it.
    df = createDataframe(student_data)
    print(df)
import pandas as pd
def getDataframeSize(players: pd.DataFrame) -> list[int]:
    """Return the dimensions of ``players`` as ``[rows, columns]``.

    Args:
        players: The DataFrame to measure.

    Returns:
        A two-element list: the number of rows, then the number of columns.
    """
    # DataFrame.shape is a (rows, columns) tuple; convert it to a list.
    return list(players.shape)
if __name__ == '__main__':
    # Build an actual DataFrame instance; the bare ``pd.DataFrame`` class has
    # no usable ``shape`` to unpack.
    df = pd.DataFrame({
        'player_id': [846, 749, 155],
        'name': ['Mason', 'Riley', 'Bob'],
        'age': [21, 30, 28],
    })
    rows, columns = getDataframeSize(df)
    print(f"This DataFrame contains {rows} rows and {columns} columns.")
Explanation:
(players.shape)
Here, players is the DataFrame.
In pandas, the shape attribute of a DataFrame returns a tuple of (number of rows, number of columns).
import pandas as pd
def selectFirstRows(employees: pd.DataFrame) -> pd.DataFrame:
    """Return the first three rows of ``employees``.

    Args:
        employees: The DataFrame to take rows from.

    Returns:
        A DataFrame with at most three rows, taken from the top of the input.
    """
    return employees.head(3)
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'employee_id': [3, 90, 9, 60],
        'name': ['Bob', 'Alice', 'Tatiana', 'Annabelle'],
        'salary': [4653, 6086, 1687, 1230],
    })
    first_3_rows = selectFirstRows(df)
    # Print the selected rows; ``employees`` only exists inside the function.
    print(first_3_rows)
import pandas as pd
def selectData(students: pd.DataFrame) -> pd.DataFrame:
    """Select the name and age of the student whose ``student_id`` is 101.

    Args:
        students: DataFrame with at least the columns ``student_id``,
            ``name`` and ``age``.

    Returns:
        The ``name`` and ``age`` columns of the matching row(s). When no row
        has ``student_id == 101`` the input DataFrame is returned unchanged.
    """
    student_101 = students.query('student_id == 101')
    if not student_101.empty:
        return student_101[['name', 'age']]
    # Fall back to the argument itself rather than a module-level ``df``,
    # so the function works regardless of what the caller named its frame.
    return students
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'student_id': [101, 53, 128, 3],
        'name': ['Ulysses', 'William', 'Henry', 'Henry'],
        'age': [13, 10, 6, 11],
    })
    student = selectData(df)
    print("\nRows name AND age (using .query()):\n", student)
Explanation:
To filter rows by a condition, we can use df.query().
"Rows name AND age (using .query())"
This is a descriptive string that explains what the following output represents. It indicates that the output will show the "name" and "age" columns of a DataFrame, and that the data was selected using the .query() method in pandas.
import pandas as pd
def createBonusColumn(employees: pd.DataFrame) -> pd.DataFrame:
    """Add a ``bonus`` column equal to twice each employee's salary.

    The column is added to ``employees`` in place, and the same DataFrame
    object is returned.

    Args:
        employees: DataFrame containing a ``salary`` column.

    Returns:
        ``employees`` with the additional ``bonus`` column.
    """
    employees['bonus'] = employees['salary'] * 2
    return employees
if __name__ == '__main__':
    # Sample input; the snippet previously referenced an undefined ``data``.
    data = {
        'name': ['Piper', 'Grace', 'Georgia'],
        'salary': [4548, 28150, 1103],
    }
    df = pd.DataFrame(data)
    call_df = createBonusColumn(df)
    print(call_df)
Explanation:
We can create a new column in the DataFrame with
df[column_name] = values
Here:
df['bonus'] = df['salary'] * 2
import pandas as pd
def dropDuplicateEmails(customers: pd.DataFrame) -> pd.DataFrame:
    """Remove rows whose ``email`` duplicates that of an earlier row.

    Args:
        customers: DataFrame containing an ``email`` column.

    Returns:
        A new DataFrame in which each ``email`` value appears once; the
        first occurrence of each value is the one retained.
    """
    # Only the email column is compared when deciding what counts as a
    # duplicate; other columns may differ between the dropped and kept rows.
    return customers.drop_duplicates(subset='email')
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'customer_id': [1, 2, 3],
        'name': ['Ella', 'David', 'Zachary'],
        'email': ['emily@example.com', 'michael@example.com',
                  'emily@example.com'],
    })
    call_df = dropDuplicateEmails(df)
    # Print the returned frame; ``non_duplicated_df`` is not defined here.
    print(call_df)
Explanation:
To remove duplicate rows, use drop_duplicates().
non_duplicated_df=customers.drop_duplicates('email') --> removes only the rows whose value in the email column duplicates an earlier row (the first occurrence is kept).
import pandas as pd
def dropMissingData(students: pd.DataFrame) -> pd.DataFrame:
    """Remove rows whose ``name`` value is missing.

    Args:
        students: DataFrame containing a ``name`` column.

    Returns:
        A new DataFrame without the rows where ``name`` is missing.
    """
    # ``subset`` restricts the missing-value check to the name column, so
    # gaps in other columns do not cause a row to be dropped.
    return students.dropna(subset=['name'])
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'student_id': [32, 217, 779, 849],
        'name': ['Piper', None, 'Georgia', 'Willow'],
        'age': [5, 19, 20, 14],
    })
    call_df = dropMissingData(df)
    # Print the returned frame; ``new_df`` only exists inside the function.
    print(call_df)
dropna() drops rows with missing data from the DataFrame.
subset=['col_name'] restricts the missing-value check to the specified column(s).
import pandas as pd
def modifySalaryColumn(employees: pd.DataFrame) -> pd.DataFrame:
    """Double every value in the ``salary`` column.

    The column is overwritten on ``employees`` in place, and the same
    DataFrame object is returned.

    Args:
        employees: DataFrame containing a ``salary`` column.

    Returns:
        ``employees`` with each salary multiplied by two.
    """
    # Column-wide arithmetic replaces the per-element ``apply(lambda ...)``;
    # it yields the same values without a Python-level call per row.
    employees['salary'] = employees['salary'] * 2
    return employees
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'name': ['Jack', 'Piper', 'Mia', 'Ulysses'],
        'salary': [19666, 74754, 62509, 54866],
    })
    call_df = modifySalaryColumn(df)
    print(call_df)
Explanation:
employees['salary'] = employees['salary'].apply(lambda salary: salary * 2)
To modify the values of a column, one option is to apply a lambda function to it with .apply(); for simple arithmetic, employees['salary'] * 2 gives the same result.
import pandas as pd
def renameColumns(students: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``students`` with more descriptive column names.

    The columns ``id``, ``first``, ``last`` and ``age`` become
    ``student_id``, ``first_name``, ``last_name`` and ``age_in_years``.

    Args:
        students: DataFrame whose columns should be renamed.

    Returns:
        A new DataFrame with the renamed columns; the input is not modified.
    """
    new_names = {
        'id': 'student_id',
        'first': 'first_name',
        'last': 'last_name',
        'age': 'age_in_years',
    }
    return students.rename(columns=new_names)
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'id': [1, 2, 3],
        'first': ['Mason', 'Ava', 'Taylor'],
        'last': ['King', 'Wright', 'Hall'],
        'age': [6, 7, 16],
    })
    call_df = renameColumns(df)
    print(call_df)
Explanation: use rename(columns={old_name: new_name}) to change column names.
import pandas as pd
explanation :
def changeDatatype(students: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``grade`` column to integers.

    The column is overwritten on ``students`` in place, and the same
    DataFrame object is returned.

    Args:
        students: DataFrame containing a ``grade`` column.

    Returns:
        ``students`` with ``grade`` cast to ``int``.
    """
    students['grade'] = students['grade'].astype(int)
    return students
if __name__ == '__main__':
    # Build an actual DataFrame instance rather than binding the class itself.
    df = pd.DataFrame({
        'student_id': [1, 2],
        'name': ['Ava', 'Kate'],
        'age': [6, 15],
        'grade': [73.0, 87.0],
    })
    call_df = changeDatatype(df)
    # Show the result, as the other snippets in this file do.
    print(call_df)
astype() is used for explicit conversion of the datatype
Comments
Post a Comment