首页 > 代码库 > hive中文乱码-解析json

hive中文乱码-解析json

--最近在做手机型号维度分析统计中,手机型号出现中文乱码的现象,查找的最终的原因的,在json解析的过程中,出现乱码,定位是分隔符的问题 ‘||‘

解决思路:用split函数的(||)切割json和ip,最终没有乱码的现象,但是问题解决了,不知是什么原因,估计是其中类的原因

select phone_mode_name,phone_brand_name from intdata.kesheng_sdk_device where phone_mode_name like ‘%?%‘ or phone_brand_name and src_file_day = 20170124 like ‘%?%‘ limit 3 ;
Total MapReduce CPU Time Spent: 5 minutes 2 seconds 850 msec
OK
?? GT-i9220     ??
?? GT-S5830     ??
?? GT-S5830     ??
Time taken: 25.528 seconds, Fetched: 3 row(s)


--create table
CREATE TABLE intdata.kesheng_sdk_device(
        device_id          string,
        imei               string,
        idfa               string,
        idfv               string,
        imsi               string,
        sim                string,
        udid               string,
        os_type_code       string,
        macId              string,
        app_ver_code       string,
        phone_mode_name    string,
        phone_brand_name   string,
        phone_number       string,
        sreen_width        string,
        sreen_height       string,
        sreen_density      string,
        app_key            string,
        app_pkg_name       string,
        app_channel_id     string,
        user_id            string,
        language_code      string,
        upload_unix_time   string,
        os_ver_code        string,
        sdk_ver_code       string,
        is_BTB_flag        string,
        back_cnt           bigint,
        install_id         string        
)
partitioned by(src_file_day string)
clustered by(device_id) into 128 buckets;

---
add jar /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
set mapreduce.job.name=kesheng_sdk_device_${EXTRACT_DATE};
set hive.enforce.bucketing=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;

INSERT OVERWRITE TABLE intdata.kesheng_sdk_device PARTITION(src_file_day)
SELECT
    concat_ws(‘_‘, nvl(install_id, ‘‘),  nvl(COALESCE(imei, idfa), ‘‘)) device_id,
    imei,
    idfa,
    idfv,
    imsi,
    sim,
    udid,
    os_type,
    macId,
    app_ver_code,
    phone_mode_name,
    phone_brand_name,
    phone_number,
    sreen_width,
    sreen_height,
    sreen_density,
    app_key,
    app_pkg_name,
    app_channel_id,
    user_id,
    language_code,
    upload_unix_time,
    os_ver_code,
    sdk_ver_code,
    is_BTB_flag,
    back_cnt,
    install_id,
    src_file_day
FROM (SELECT
         v.imei,
         v.idfa,
         v.idfv,
         v.imsi,
         v.sim,
         v.udid,
         v.os as os_type,
         v.macId,
         v.appVersion as app_ver_code,
         v.phoneMode as phone_mode_name,
         v.phoneBrand as phone_brand_name,
         v.phoneNumber as phone_number,
         v.sreenWidth as sreen_width,
         v.sreenHeight as sreen_height,
         v.sreenDensity as sreen_density,
         v.appkey as app_key,
         v.apppkg as app_pkg_name,
         v.appchannel as app_channel_id,
         v.userid as user_id,
         v.language as language_code,
         cast(v.uploadTs as bigint) as upload_unix_time,
         v.osversion as os_ver_code,
         v.sdkversion as sdk_ver_code,
         v.isBTB as is_BTB_flag,
         cast(v.backCount as bigint) as back_cnt,
         v.installationID as install_id,
         v.extract_date_label as src_file_day,
         row_number() over(partition by v.installationID, v.imei, v.idfa order by v.uploadTs desc) row_num
    FROM ods.kesheng_sdk_deviceinfo_v v
    WHERE v.extract_date_label = ‘${EXTRACT_DATE}‘ and
         (
          (v.os = ‘AD‘ and (v.installationID is not null or v.imei is not null))
          or (v.os = ‘iOS‘ and (v.installationID is not null or v.idfa is not null))
         )
    ) x
WHERE row_num = 1;


---
CREATE OR REPLACE VIEW ods.kesheng_sdk_deviceinfo_v
AS
SELECT concat(s.INPUT__FILE__NAME, ‘:‘, s.BLOCK__OFFSET__INSIDE__FILE) as rowkey,
       d.os,
       d.imei,
       d.imsi,
       d.idfa,
       d.idfv,
       d.sim,
       d.macId,
       d.appVersion,
       d.phoneMode,
       d.phoneBrand,
       d.phoneNumber,
       d.sreenWidth,
       d.sreenHeight,
       d.sreenDensity,
       d.appkey,
       d.apppkg,
       d.appchannel,
       d.userid,
       d.language,
       d.isBTB,
       d.backCount,
       d.uploadTs,
       d.clientId,
       d.sessionId,
       d.osversion,
       d.sdkversion,
       d.ints,
       d.networktype,
       d.udid,
       d.installationID,
       s.client_ip,
       s.extract_date_label,
       s.extract_hour_label
FROM ods.kesheng_sdk_json_ex_v s
LATERAL VIEW json_tuple(s.json, ‘deviceInfo‘) p as deviceInfo
LATERAL VIEW json_tuple(p.deviceInfo, ‘os‘, ‘imei‘, ‘imsi‘, ‘idfa‘, ‘idfv‘, ‘sim‘, ‘macId‘, ‘appVersion‘, ‘phoneMode‘, ‘phoneBrand‘, ‘phoneNumber‘, ‘sreenWidth‘, ‘sreenHeight‘, ‘sreenDensity‘, ‘appkey‘, ‘apppkg‘, ‘appchannel‘, ‘userid‘, ‘language‘, ‘isBTB‘, ‘backCount‘, ‘uploadTs‘, ‘clientId‘, ‘sessionId‘, ‘osversion‘, ‘sdkversion‘, ‘ints‘, ‘networktype‘, ‘udid‘, ‘installationID‘) d AS os, imei, imsi, idfa, idfv, sim, macId, appVersion, phoneMode, phoneBrand, phoneNumber, sreenWidth, sreenHeight, sreenDensity, appkey, apppkg, appchannel, userid, language, isBTB, backCount, uploadTs, clientId, sessionId, osversion, sdkversion, ints, networktype, udid, installationID
WHERE p.deviceInfo IS NOT NULL;

----
create or replace view ods.kesheng_sdk_json_ex_v
as
select line[0] json, line[1] client_ip
      ,src_file_day extract_date_label
      ,src_file_hour extract_hour_label
      ,input_file_name ,block_offset_inside_file
  from (select split(t1.line,‘\\|\\|‘) line
              ,src_file_day,src_file_hour
              ,INPUT__FILE__NAME input_file_name  ,BLOCK__OFFSET__INSIDE__FILE block_offset_inside_file
          from ods.kesheng_sdk_raw_ex t1
       ) t2

注意:
分割字符串函数: split
语法: split(string str, stringpat)
返回值: array
说明:按照pat字符串分割str,会返回分割后的字符串数组
举例:
hive> select split(‘abtcdtef‘,‘t‘) from lxw_dual;
["ab","cd","ef"]

   
drop table ods.kesheng_sdk_raw_ex;
create external table ods.kesheng_sdk_raw_ex (
    line string
)
partitioned by (src_file_day string, src_file_hour string)
location ‘/user/hadoop/ods/kesheng/‘;

/* --------------------
-- 补分区
import  datetime

beg_date = datetime.datetime.strptime(‘20170101‘,‘%Y%m%d‘)
end_date = datetime.datetime.strptime(‘20170123‘,‘%Y%m%d‘)

fmt_add_partition = "alter table ods.kesheng_sdk_raw_ex add if not exists partition(src_file_day=‘{0}‘, src_file_hour=‘00‘) location ‘{0}/00‘;"
‘‘‘
while beg_date < end_date:
    beg_date = beg_date + datetime.timedelta(days=1)
    print fmt_add_partition.format(datetime.datetime.strftime(beg_date, ‘%Y%m%d‘))

beg_date = datetime.datetime.strptime(‘20160811‘,‘%Y%m%d‘)
end_date = datetime.datetime.strptime(‘20161109‘,‘%Y%m%d‘)
‘‘‘

fmt_add_partition = "alter table ods.kesheng_sdk_raw_ex add if not exists partition(src_file_day=‘{0}‘, src_file_hour=‘{1}‘) location ‘{0}/{1}‘;"

while beg_date < end_date:
    beg_date = beg_date + datetime.timedelta(days=1)
    for i in xrange(24):
       print fmt_add_partition.format(datetime.datetime.strftime(beg_date, ‘%Y%m%d‘), str(i).rjust(2,‘0‘))
*-----------------------------/

hive中文乱码-解析json