当前位置: 首页 > news >正文

实验三

任务一:导入数据

  • 创建两个数据表
    • Hive QL代码
      USE default;  -- 选择使用名为 default 的数据库DROP TABLE IF EXISTS ccf_offline_stage1_train;
      CREATE TABLE ccf_offline_stage1_train (user_id STRING,merchant_id STRING,coupon_id STRING,discount_rate STRING,distance STRING,date_received STRING,`date` STRING
      ) -- 创建新表,包含7个字段
      -- 这里 data 要打 `` 的原因是data是关键字
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY ','  -- 指定数据文件使用逗号(,)作为字段分隔符(CSV格式)
      STORED AS TEXTFILE  -- 数据以文本文件格式存储
      TBLPROPERTIES ("skip.header.line.count"="1"
      ); -- 表属性设置:跳过外部数据文件的第一行(通常是标题行)DROP TABLE IF EXISTS ccf_online_stage1_train;
      CREATE TABLE ccf_online_stage1_train (user_id STRING,merchant_id STRING,action STRING,coupon_id STRING,discount_rate STRING,date_received STRING,`date` STRING
      )
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY ','
      STORED AS TEXTFILE
      TBLPROPERTIES ("skip.header.line.count"="1"
      );LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_offline_stage1_train.csv'
      -- 从本地(LOCAL)文件系统(不是HDFS)加载文件
      OVERWRITE INTO TABLE ccf_offline_stage1_train;
      -- 覆盖写入新数据到之前的表中LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_online_stage1_train.csv'
      OVERWRITE INTO TABLE ccf_online_stage1_train;
      
    • 执行结果
      image
  • 查询验证数据加载成功
    • Hive QL代码
      USE default;SELECT * FROM ccf_offline_stage1_train LIMIT 5;
      -- 从 ccf_offline_stage1_train 表中查询所有字段(* 表示所有列),只返回前五列
      SELECT * FROM ccf_online_stage1_train LIMIT 5;
      SELECT COUNT(*) FROM ccf_offline_stage1_train;
      -- 统计 ccf_offline_stage1_train 表中的总行数
      SELECT COUNT(*) FROM ccf_online_stage1_train;
    • 执行结果
      • 查询
        image
      • 计数
        image
        image

任务二:基本数据查询

  • 查询⽤户⾏为数量
    • Hive QL代码
      USE default;SELECTCASE actionWHEN '0' THEN 'CLICK'WHEN '1' THEN 'BUY'WHEN '2' THEN 'RECEIVE'ELSE CONCAT('UNKNOWN_', action)END AS behavior, -- 将原始的动作编码转换为有意义的英文名称COUNT(*) AS total_count -- 计算每种行为出现的总次数
      FROM ccf_online_stage1_train -- 从名为 ccf_online_stage1_train 的数据表中读取数据
      GROUP BY action -- 按原始的动作编码进行分组统计
      ORDER BY total_count DESC; -- 按行为数量从高到低排序
      
    • 举例
      • 假设原始数据是:0,0,1,0,2,1
      • 分组之后
        组1 (action='0'): 记录1, 记录2, 记录4
        组2 (action='1'): 记录3, 记录6
        组3 (action='2'): 记录5
        
      • 对每个组分别统计行数
        组1: COUNT(*) = 3
        组2: COUNT(*) = 2  
        组3: COUNT(*) = 1
        
    • 执行结果
      image
  • 查询指定商家优惠券使⽤情况
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS onlineconsumption_table;
      CREATE TABLE onlineconsumption_table AS -- 这里的AS是创建表的语法关键字,表示"基于查询结果创建表"
      SELECTmerchant_id,SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NULL THEN 1 ELSE 0 END) AS negative_samples,SUM(CASE WHEN coupon_id IS NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS normal_consumption,SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS positive_samples
      FROM ccf_online_stage1_train
      GROUP BY merchant_id;SELECT * FROM onlineconsumption_table LIMIT 20;
      
    • 执行结果
      image

任务三:数据聚合分析

  • 商家周边活跃顾客数量统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS merchant_distance_active_customers;
      CREATE TABLE merchant_distance_active_customers AS
      SELECTmerchant_id,NVL(distance, 'UNKNOWN') AS distance_bucket,COUNT(DISTINCT user_id) AS active_customer_count
      FROM ccf_offline_stage1_train
      GROUP BY merchant_id, NVL(distance, 'UNKNOWN');SELECTmerchant_id,distance_bucket,active_customer_count
      FROM merchant_distance_active_customers
      ORDER BY merchant_id, distance_bucket
      LIMIT 50;
      
    • 执行结果
      image
      image
      其中第一列是商家ID,第二列是距离x,第三列是数目
  • 商家正样本比例统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS merchant_positive_ratio;
      CREATE TABLE merchant_positive_ratio AS
      SELECTmerchant_id,positive_samples,negative_samples,normal_consumption,(negative_samples + normal_consumption + positive_samples) AS total_samples,CASEWHEN (negative_samples + normal_consumption + positive_samples) > 0 THENpositive_samples / CAST((negative_samples + normal_consumption + positive_samples) AS DOUBLE)ELSE 0.0END AS positive_ratio
      FROM onlineconsumption_table;SELECTmerchant_id,positive_ratio,positive_samples,total_samples
      FROM merchant_positive_ratio
      ORDER BY positive_ratio DESC, positive_samples DESC
      LIMIT 10;
      
    • 执行结果
      image

任务四:复杂查询与分析

  • 优惠券使用时间统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS coupon_usage_interval_stats;
      CREATE TABLE coupon_usage_interval_stats AS
      WITH offline_clean AS (SELECTCASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,CASE WHEN date_received IS NOT NULL AND date_received != 'null' THEN date_received ELSE NULL END AS date_receivedFROM ccf_offline_stage1_train
      ),
      coupon_usage AS (SELECTcoupon_id,DATEDIFF(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_consumed, 'yyyyMMdd'))),TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_received, 'yyyyMMdd')))) AS usage_intervalFROM offline_cleanWHERE coupon_id IS NOT NULLAND date_consumed IS NOT NULLAND date_received IS NOT NULL
      ),
      usage_counts AS (SELECTcoupon_id,COUNT(*) AS usage_count,AVG(usage_interval) AS avg_usage_intervalFROM coupon_usageGROUP BY coupon_id
      ),
      total_usage AS (SELECT SUM(usage_count) AS total_usage_count FROM usage_counts
      )
      SELECTu.coupon_id,u.usage_count,ROUND(u.avg_usage_interval, 2) AS avg_usage_interval
      FROM usage_counts u
      CROSS JOIN total_usage t
      WHERE u.usage_count > t.total_usage_count * 0.01;SELECTcoupon_id,usage_count,avg_usage_interval
      FROM coupon_usage_interval_stats
      ORDER BY usage_count DESC, coupon_id
      LIMIT 100;
      
    • 执行结果
      image
  • 优惠券折扣率统计
    • Hive QL代码
      USE default;DROP TABLE IF EXISTS coupon_usage_rate_stats;
      CREATE TABLE coupon_usage_rate_stats AS
      WITH offline_clean AS (SELECTCASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,CASE WHEN discount_rate IS NOT NULL AND discount_rate != 'null' THEN discount_rate ELSE NULL END AS discount_rateFROM ccf_offline_stage1_train
      ),
      coupon_totals AS (SELECTcoupon_id,MAX(discount_rate) AS discount_rate,COUNT(*) AS total_count,SUM(CASE WHEN date_consumed IS NOT NULL THEN 1 ELSE 0 END) AS used_countFROM offline_cleanWHERE coupon_id IS NOT NULLGROUP BY coupon_id
      ),
      final_stats AS (SELECTcoupon_id,used_count,total_count,CASE WHEN total_count > 0 THEN used_count / CAST(total_count AS DOUBLE) ELSE 0.0 END AS usage_rate,CASEWHEN discount_rate LIKE '%:%' THEN 'threshold'ELSE 'direct'END AS discount_type,CASEWHEN discount_rate LIKE '%:%' THENCASEWHEN CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE) > 0 THEN(CAST(SPLIT(discount_rate, ':')[1] AS DOUBLE) / CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE)) * 100ELSE NULLENDELSE(1 - CAST(discount_rate AS DOUBLE)) * 100END AS discount_percentageFROM coupon_totals
      )
      SELECTcoupon_id,ROUND(usage_rate, 4) AS usage_rate,used_count,total_count,ROUND(discount_percentage, 2) AS discount_percentage_value,discount_type
      FROM final_stats;SELECTcoupon_id,usage_rate,used_count,CONCAT(CAST(discount_percentage_value AS STRING), '%') AS discount_percentage,discount_type
      FROM coupon_usage_rate_stats
      ORDER BY usage_rate DESC, used_count DESC, coupon_id
      LIMIT 10;
      
    • 执行结果
      image
http://www.hn-smt.com/news/85040/

相关文章:

  • SUV车型轮胎推荐:权威SUV胎专业推荐
  • Less-8 GET-Blind-Boolean Based-Single Quotes - 详解
  • 2025年本田雅阁更换轮胎推荐:专业轮胎选择深度解析
  • P10763 [BalticOI 2024] Tiles
  • P9911 [COCI 2023/2024 #2] Kuglice
  • 2025年水族铝型材推荐厂商TOP5权威评选:口碑好的水族铝
  • 预见未来!兰亭妙微发布2026年用户体验设计三大趋势与企业应对策略
  • 黑马C++ 演讲比赛流程管理系统
  • 重庆度小满贷款客服的双面镜:便利电话背后的信用警示与乡村振兴新实践
  • 2025年工业流体设备企业技术创新TOP5排名——上海易勒机
  • NOI2018 归程 题解
  • 2025年12月APP开发公司权威推荐榜:创新技术与用户体验双轮驱动,精选实力派开发团队深度解析
  • 2025年12月肉粉加工设备厂家推荐:五大品牌深度对比评测榜
  • 2025年辽宁靠谱的代理记账品牌企业排行榜,新测评精选代理记
  • 深入理解MySQL基本原理和架构
  • 轻松上手:使用 Vercel 部署 HTML 页面教程 - 实践
  • 十大三维扫描仪品牌推荐:国产化替代浪潮下的首选型号
  • 2025 年 12 月模胚模架厂家权威推荐榜:精密制造与稳定耐用口碑之选,解析核心优势与选购指南
  • 2025年12月AI智搜GEO服务商权威推荐榜:福州/莆田/三明地区智能搜索推广与排名优化专家深度解析
  • 2025年建筑资质办理服务排名与解析,看看哪家服务专业?
  • DVWA 靶场全通关
  • PbootCMS邮件配置修改发件人信息
  • 2025 年 12 月制氮碳分子筛厂家权威推荐榜:高效吸附与长寿命性能的工业节能之选
  • PbootCMS调用内容中换行符“”不换行怎么办(PbootCMS内容中换行符不生效的解决方法)
  • 如何修改网站文件的发表日期(如何修改网站文章的发表时间)
  • 2025年12月羽毛粉设备厂家推荐:行业权威排行榜单与选购策略分析
  • 2025年度实力不错的桥梁护栏工厂TOP5权威推荐:甄选桥梁
  • 2025年高性价比微生物生长曲线分析仪品牌排名:看看哪家品牌
  • pbootcms判断一个字段为空的时候调用另外一个字段(PbootCMS 判断字段为空时调用其他字段的方法)
  • 实用指南:面向多模态医学图像配准与分割的MambaMorph模型:基于状态空间建模的全流程分析与应用