首页 > 代码库 > ydb数据去重

ydb数据去重

1.创建ydb导出表vehiclepass_txt_export
create external table vehiclepass_txt_export(
rowKey string, regionID string, cityID string, equipmentCode string, directionCode string,laneCode string,inOutTownFlag string,tollCode string,communityID string,tollTypeCode string,dataCompanyCode string,vehiclePlateSrc string,speed int,time string,url string,year string,month string,day string,hour string,week string,dataClassify string,vehiclePlate string,plateColor string,vehicleColor string,brand string,childBrand string,model string,species string,manufacturer string,headOrTail string,brandYear string,reliability int,vehicleTypeCode string,vehiclePosition string,firstTimeInCity int,firstTimeInCounty int,firstTimeInCommunity int,iword string,ivehiclePlate string,analysisData string
) PARTITIONED BY ( `month1` string)
location ‘/data/mr/vehiclepass_txt_export‘;
2.将ydb表vehiclepass分区为201603的数据导出到外部表vehiclepass_export_txt
insert overwrite table vehiclepass_txt_export partition(month1=‘201604‘)
select
r[0] as rowKey,
r[1] as regionID,
r[2] as cityID,
r[3] as equipmentCode,
r[4] as directionCode,
r[5] as laneCode,
r[6] as inOutTownFlag,
r[7] as tollCode,
r[8] as communityID,
r[9] as tollTypeCode,
r[10] as dataCompanyCode,
r[11] as vehiclePlateSrc,
r[12] as speed,
r[13] as time,
r[14] as url,
r[15] as year,
r[16] as month,
r[17] as day,
r[18] as hour,
r[19] as week,
r[20] as dataClassify,
r[21] as vehiclePlate,
r[22] as plateColor,
r[23] as vehicleColor,
r[24] as brand,
r[25] as childBrand,
r[26] as model,
r[27] as species,
r[28] as manufacturer,
r[29] as headOrTail,
r[30] as brandYear,
r[31] as reliability,
r[32] as vehicleTypeCode,
r[33] as vehiclePosition,
r[34] as firstTimeInCity,
r[35] as firstTimeInCounty,
r[36] as firstTimeInCommunity,
r[37] as iword,
r[38] as ivehiclePlate,
r[39] as analysisData
from ydb where YSQL(‘from‘,‘select rowKey,regionID,cityID,equipmentCode,directionCode,laneCode,inOutTownFlag,tollCode,communityID,tollTypeCode,dataCompanyCode,vehiclePlateSrc,speed,time,url,year,month,day,hour,week,dataClassify,vehiclePlate,plateColor,vehicleColor,brand,childBrand,model,species,manufacturer,headOrTail,brandYear,reliability,vehicleTypeCode,vehiclePosition,firstTimeInCity,firstTimeInCounty,firstTimeInCommunity,iword,ivehiclePlate,analysisData from vehiclepass where ydbpartion = \‘201604\‘ ‘,‘segment‘);

###删除导出表指定分区数据ALTER TABLE vehiclepass_txt_export DROP IF EXISTS PARTITION (month1=‘201603‘);
3.将表vehiclepass_export_txt中201603的数据去重保存到hdfs
/hadoop/bin/hadoop jar DereplicationYunnan.jar hdfs://10.166.112.190:9000/data/mr/vehiclepass_txt_export/month1=201603 hdfs://10.166.112.190:9000/data/mr/vehiclepass/201603
//将去重后的数据进行备份
hadoop fs -cp -f /data/mr/vehiclepass/201603 /data/mr/vehiclepass_bak
4.将去重后的数据导入hive表vehiclepass_hive(同时会自动将/data/mr/vehiclepass/201603/part-r-00000去掉)
load data inpath ‘/data/mr/vehiclepass/201603/part-r-00000‘ into table vehiclepass_hive partition(month1=201603);
5.清除ydb对应分区的数据
***************************************************
select count(r[0]) as cnt from
ydb where YSQL(‘from‘,‘select count(*) from vehiclepass where ydbpartion=\‘201603\‘ and ydbkv=\‘ydb.truncate:true\‘ ‘,‘segment‘)
***************************************************
http://hwits-master:9999/droppartion?tablename=vehiclepass&partions=201603
http://hwits-master:9999/droppartion?tablename=vehiclepass&partions=201603&isforce=true

6.将hive表vhiclepass_hive的数据导入ydb表vehiclepass
###insert into table vehiclepass select *,‘201603‘,‘‘ from vehiclepass_hive;
###将表的字段按列名对应写
INSERT INTO TABLE vehiclepass SELECT rowKey,regionID,
cityID,
equipmentCode,
directionCode,
laneCode,
inOutTownFlag,
tollCode,
communityID,
tollTypeCode,
dataCompanyCode,
vehiclePlateSrc,
speed,
time,
url,
YEAR,
MONTH,
DAY,
HOUR,
WEEK,
dataClassify,
vehiclePlate,
plateColor,
vehicleColor,
brand,
childBrand,
model,
species,
manufacturer,
headOrTail,
brandYear,
reliability,
vehicleTypeCode,
vehiclePosition,
firstTimeInCity,
firstTimeInCounty,
firstTimeInCommunity,
iword,
ivehiclePlate,
analysisData,
‘201603‘,
‘‘
FROM
vehiclepass_hive;
#######################################################################
ALTER TABLE vehiclepass_hive DROP IF EXISTS PARTITION (month1=‘201603‘);
确认:
select count(*) from vehiclepass_hive;

#####################
vehiclepass_hive

CREATE TABLE vehiclepass_hive
(
rowKey string,
regionID string,
cityID string,
equipmentCode string,
directionCode string,
laneCode string,
inOutTownFlag string,
tollCode string,
communityID string,
tollTypeCode string,
dataCompanyCode string,
vehiclePlateSrc string,
speed int,
time string,
url string,
year string,
month string,
day string,
hour string,
week string,
dataClassify string,
vehiclePlate string,
plateColor string,
vehicleColor string,
brand string,
childBrand string,
model string,
species string,
manufacturer string,
headOrTail string,
brandYear string,
reliability int,
vehicleTypeCode string,
vehiclePosition string,
firstTimeInCity int,
firstTimeInCounty int,
firstTimeInCommunity int,
iword string,
ivehiclePlate string,
analysisData string
)
PARTITIONED BY(month1 STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,‘ STORED AS TEXTFILE;

ydb数据去重