大数据全系列 教程
1869个小节 · 阅读:467.8k
408考研
JAVA全系列 教程
面向对象的程序设计语言
Python全系列 教程
Python3.x版本,未来主流的版本
人工智能 教程
顺势而为,AI创新未来
大厂算法 教程
算法,程序员自我提升必经之路
C++ 教程
一门通用计算机编程语言
微服务 教程
目前业界流行的框架组合
web前端全系列 教程
通向WEB技术世界的钥匙
大数据全系列 教程
站在云端操控万千数据
AIGC全能工具班
A A
White Night
需求:
根据用户的注册信息表及用户每日登录平台的数据信息来分析用户7日留存情况。
注册信息数据data/demo/registuser.csv
xxxxxxxxxx
uid,regist_day,regist_os
u1,20300401,android
u2,20300401,iphone
u3,20300401,android
u4,20300402,iphone
u5,20300402,android
u6,20300403,iphone
u7,20300403,iphone
u8,20300404,android
u9,20300404,android
登录信息部分数据data/demo/loginInfos.csv
xxxxxxxxxx
uid,login_date
u1,20300401
u1,20300401
u2,20300401
u2,20300401
u3,20300401
u1,20300402
u2,20300402
u3,20300402
u4,20300402
u5,20300402
u1,20300403
SQL函数:
xxxxxxxxxx
package com.itbaizhan.sql.examples
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Computes 7-day user retention: for each registration day, how many
 * registered users logged in again 1..7 days after registering.
 *
 * Inputs (CSV with header row):
 *   - data/demo/registuser.csv  : uid, regist_day (yyyyMMdd), regist_os
 *   - data/demo/loginInfos.csv  : uid, login_date (yyyyMMdd)
 *
 * Output: two identical result tables printed via show() — first computed
 * in two steps through a temp view, then as a single merged query.
 */
object UserKeep7Day {
  def main(args: Array[String]): Unit = {
    // 1. Build the SparkSession (local mode for this demo).
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("UserKeep7Day")
      .getOrCreate()
    // Quiet the console; Spark treats the level name case-insensitively.
    spark.sparkContext.setLogLevel("Error")

    // 2. Read the source CSVs; the header row supplies column names.
    val registInfos: DataFrame = spark.read.option("header", true)
      .csv("data/demo/registuser.csv")
    val loginInfos: DataFrame = spark.read.option("header", true)
      .csv("data/demo/loginInfos.csv")

    // 3. Register temp views so the analysis can be written in SQL.
    registInfos.createTempView("regist")
    loginInfos.createTempView("login_info")

    // 4. Deduplicate logins: keep one row per (uid, login_date) so a user
    //    logging in several times on one day counts once.
    spark.sql(
      """
        |select distinct uid,login_date
        |from login_info
        |""".stripMargin).createTempView("login")

    // 5. Join logins to registrations and compute diff = whole days between
    //    login and registration. Dates arrive as yyyyMMdd strings, so each
    //    is round-tripped through unix_timestamp -> from_unixtime into the
    //    yyyy-MM-dd form that datediff expects (e.g. 20300404 -> 2030-04-04).
    // sanity check: spark.sql("select datediff('2030-04-04','2030-04-03')").show()
    spark.sql(
      """
        |select b.uid,b.regist_day,a.login_date,
        |datediff(from_unixtime(unix_timestamp(a.login_date,'yyyyMMdd'),'yyyy-MM-dd'),
        | from_unixtime(unix_timestamp(b.regist_day,'yyyyMMdd'),'yyyy-MM-dd')) as diff
        |from login a join regist b
        |on a.uid = b.uid
        |""".stripMargin).createTempView("temp")

    // 6. Retention counts per registration day for days 1..7.
    //    diff > 0 excludes the registration-day login itself.
    spark.sql(
      """
        |select regist_day,diff,count(uid) usercount
        |from temp
        |where diff > 0
        |and diff <= 7
        |group by regist_day,diff
        |order by regist_day,diff
        |""".stripMargin).show()

    // 7. Steps 5 and 6 merged into a single query. The derived table now
    //    carries an alias ("t") — required by Hive-compatible parsers and
    //    older Spark versions, and harmless on current ones.
    spark.sql(
      """
        |select regist_day,diff,count(uid) usercount
        |from (select b.uid,b.regist_day,a.login_date,
        | datediff(from_unixtime(unix_timestamp(a.login_date,'yyyyMMdd'),'yyyy-MM-dd'),
        | from_unixtime(unix_timestamp(b.regist_day,'yyyyMMdd'),'yyyy-MM-dd')) as diff
        | from login a join regist b
        | on a.uid = b.uid) t
        |where diff > 0
        |and diff <= 7
        |group by regist_day,diff
        |order by regist_day,diff
        |""".stripMargin).show()

    // Release the session and its underlying SparkContext.
    spark.close()
  }
}