#********** Begin **********#
def clean_column_name(col_name: str) -> str:
    """Return a normalized version of a column label.

    The label is stripped of surrounding whitespace, the phrase
    "Operating System" is abbreviated to "OS", spaces become underscores,
    parentheses are removed, and the result is lower-cased.

    Args:
        col_name: The raw column label.

    Returns:
        The cleaned label, e.g. " Operating System (Version) " becomes
        "os_version".
    """
    # Remove leading and trailing whitespace.
    col_name = col_name.strip()
    # Abbreviate "Operating System" to "OS" (must run before lower-casing,
    # because the match is case-sensitive).
    col_name = col_name.replace('Operating System', 'OS')
    # Replace every space with an underscore.
    col_name = col_name.replace(' ', '_')
    # Remove parentheses.
    col_name = col_name.replace('(', '').replace(')', '')
    # Convert to lowercase.
    col_name = col_name.lower()
    return col_name
# Build the cleaned label for every column of `laptops`, then assign the
# whole list back so all columns are renamed in one step.
col_name_list = []
for col_name in laptops.columns:
    col = clean_column_name(col_name)
    col_name_list.append(col)
laptops.columns = col_name_list
print(laptops.columns)
#********** End **********#
# 第4关 将字符串列转换为数值列 (Level 4: converting string columns to numeric columns)
#********** Begin **********#
# Collect the distinct values stored in the "ram" column and display them.
unique_ram = laptops['ram'].unique()
print(unique_ram)
#********** End **********#
# 第5关 删除非数字字符 (Level 5: removing non-numeric characters)
#********** Begin **********#
# Remove the "GB" text from every "ram" entry; the column still holds
# strings after this step.
laptops["ram"] = laptops["ram"].str.replace('GB', '')
# Display the distinct values that remain after the replacement.
unique_ram = laptops['ram'].unique()
print(unique_ram)
#********** End **********#
# 第6关 将列转换为数字类型 (Level 6: converting a column to a numeric type)
#********** Begin **********#
# Cast the "ram" column to integers and store it back in place.
laptops["ram"] = laptops["ram"].astype(dtype=int)
# Capture and display the dtype of every column after the cast.
dtypes = laptops.dtypes
print(dtypes)
#********** End **********#
# 第7关 列的重命名 (Level 7: renaming columns)
#********** Begin **********#
# Rename the "ram" column to "ram_gb" in place.
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
# Summarize the renamed column with describe() and display the result.
ram_gb_desc = laptops["ram_gb"].describe()
print(ram_gb_desc)
#********** End **********#
# 第8关 从字符串中提取数值 (Level 8: extracting values from strings)
#********** Begin **********#
# Split each "cpu" string once on whitespace and keep the first token as
# the new "cpu_manufacturer" column.
laptops["cpu_manufacturer"] = laptops["cpu"].str.split(n=1).str[0]
# Count how many rows fall under each extracted token.
cpu_manufacturer_counts = laptops["cpu_manufacturer"].value_counts()
print(cpu_manufacturer_counts)
#********** End **********#
# 第9关 纠正错误值 (Level 9: correcting bad values)
#********** Begin **********#
# Replace every "os" value with its entry in `mapping_dict`.
# NOTE(review): `mapping_dict` is defined outside this snippet; with a plain
# dict, Series.map yields NaN for values that have no key — confirm the
# mapping covers every existing label.
laptops["os"] = laptops["os"].map(mapping_dict)
print(laptops["os"].value_counts())
#********** End **********#
# 第10关 删除缺失值 (Level 10: dropping missing values)
#********** Begin **********#
# Copy of `laptops` without any row that contains a missing value.
laptops_no_null_rows = laptops.dropna()
# Copy of `laptops` without any column that contains a missing value.
laptops_no_null_cols = laptops.dropna(axis=1)
# Show the first 20 rows of each result.
print(laptops_no_null_rows.head(20))
print(laptops_no_null_cols.head(20))
#********** End **********#
# 第11关 填充缺失值 (Level 11: filling missing values)
#********** Begin **********#
# Count, per "os" value, the rows whose "os_version" is missing before any
# filling takes place.
value_counts_before = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
# Set "os_version" for every row whose "os" is "No OS" or "macOS"; the
# assignment applies to all matching rows, not only those that were null.
laptops.loc[laptops["os"] == "No OS", "os_version"] = "Version Unknown"
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"
# Recount the rows still missing "os_version" to compare with the first count.
value_counts_after = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
print(value_counts_before)
print(value_counts_after)
#********** End **********#
# 第12关 挑战:对字符串列进行清洗 (Level 12: challenge — cleaning a string column)
#********** Begin **********#
# Remove the "kgs" text first so that the later "kg" replacement cannot
# leave a stray trailing "s" behind.
laptops["weight"] = laptops["weight"].str.replace("kgs", "")
# Remove the remaining "kg" text and convert the column to a numeric dtype.
laptops["weight"] = pd.to_numeric(laptops["weight"].str.replace("kg", ""))
# Rename the column to "weight_kg" in place.
laptops.rename({"weight": "weight_kg"}, axis=1, inplace=True)
# Write the cleaned table to disk without the index column.
laptops.to_csv("laptops_clean.csv", index=False)
print(laptops.columns)
#********** End **********#