-- 淘宝案例数据准备
create database taobao1123;
use taobao1123;
create table UserBehavior(
user_id int,
item_id int,
item_category int,
behavior_type varchar(10),
user_geohash varchar(10),
times datetime,
amount decimal(5,2)
);
load data infile "C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/UserBehavior.csv"
into table UserBehavior
fields terminated by ',' ignore 1 lines;
select * from UserBehavior limit 10;
select count(*) from UserBehavior;#1048575
-- 1,数据清洗
-- 1.1 缺失值处理
###1.1.1缺失值数量
select
count(user_id),
count(item_id),
count(item_category),
count(behavior_type),
count(user_geohash),
count(times),
count(amount)
from UserBehavior;
###1.1.2缺失值比例
select
sum(user_id is null )/count(*),
sum(item_id is null)/count(*),
sum(item_category is null)/count(*),
sum(behavior_type is null)/count(*),
sum(user_geohash is null)/count(*),
sum(times is null)/count(*),
sum(amount is null)/count(*)
from UserBehavior;
-- 1.2 异常值检查
select min(times),max(times),min(amount),max(amount) from UserBehavior;
-- 1.3 重复记录处理
select distinct * from UserBehavior;
暂无数据