Commit 52095d2c by chenyuanjie

Revert "迁移代码"

This reverts commit 84f106a8.
parent 84f106a8
# 用于本地py代码测试
import os
import sys
import time
sys.path.append(os.path.dirname(sys.path[0]))
import paramiko
from utils.common_util import CommonUtil
ssh_host = "hadoop5"
ssh_port = 22
ssh_user = "root"
ssh_pwd = "LrmkEqypH4ZV4S4jA3gq3tSRTNsp2gpjqupLDM5K"
remote_dir = "/tmp/wjc_py/"
remote_py = "/opt/module/anaconda3/envs/pyspark/bin/python"
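# put_and_run: 通过SFTP将本地py文件上传到remote_dir, 再经SSH调用远程spark-submit执行并打印输出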
def put_and_run(py_file, args):
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname=ssh_host, port=ssh_port, username=ssh_user, password=ssh_pwd)
print("连接远程服务器成功...")
sftp = client.open_sftp()
file_name = py_file[py_file.rfind("\\") + 1:]
remote_file = f"{remote_dir}{file_name}"
print(f"上传文件【{py_file}】到远程【{remote_file}】中...")
sftp.put(py_file, remote_file)
argstr = " ".join(args)
cmd = """
/opt/module/spark/bin/spark-submit \
--driver-memory 500M \
--executor-memory 500M \
--executor-cores 1 \
--num-executors 3 \
--queue spark \
{py_file} {args}
"""
cmd = cmd.format(py_file=remote_file, args=argstr)
print(f"执行远程命令:【{cmd}】 中...")
stdin, stdout, stderr = client.exec_command(cmd)
print(stdout.read().decode('utf-8'))
client.close()
pass
def submit_remote_run(file_path, args):
"""
测试算法用 使用local模式
:param file_path: 本地文件路径
:param args: 参数
:return:
"""
argstr = " ".join(args)
dir = "/tmp/wjc_py/"
cmd = """
/opt/module/spark/bin/spark-submit \\
--driver-memory 500M \\
--executor-memory 500M \\
--executor-cores 1 \\
--num-executors 3 \\
--queue spark \\
{py_file} {args}
"""
cmd = cmd.format(py_file=dir + file_path, args=argstr)
print("=====================执行远程命令========================")
print(cmd)
print("=====================执行远程命令========================")
fr = os.popen(cmd, "r")
print(fr.read())
fr.close()
def submit_yarn_run(file_path, args):
"""
直接提交spark到yarn
:param file_path: 本地文件路径
:param args: 参数
:return:
"""
arg_str1 = " ".join(args)
arg_str2 = "_".join(args)
dir = "/tmp/wjc_py/"
file_name = file_path[file_path.rfind("/") + 1:file_path.rfind(".")]
log_path = f"/tmp/wjc_java/log/{file_name}_{arg_str2}.log"
py_file = dir + file_path
cmd = f"""
/opt/module/spark/bin/spark-submit \\
--master yarn \\
--driver-memory 2g \\
--executor-memory 10g \\
--executor-cores 4 \\
--num-executors 25 \\
--queue spark \\
{py_file} {arg_str1}
"""
print("=====================执行远程命令========================")
print(cmd)
print("=====================执行远程命令========================")
fr = os.popen(cmd, "r")
print(fr.read())
fr.close()
print("====================日志文件位于========================")
print(log_path)
if __name__ == '__main__':
submit_yarn_run(
file_path="my_kafka/keyword_pcp_listener.py",
args=[]
)
submit_remote_run(
file_path="my_kafka/keyword_pcp_listener.py",
args=[]
)
submit_yarn_run(
file_path="my_kafka/keyword_pcp_listener.py",
args=[]
)
# submit_remote_run(
# file_path="script/test_overwrite_insert.py",
# args=[
# "us",
# "day",
# "2023-01-01"
# ]
# )
# submit_yarn_run(
# file_path="dwd/dwd_st_volume_fba.py",
# args=[
# "us",
# "day",
# "2023-01-01"
# ]
# )
# submit_yarn_run(
# file_path="sparkTest/tmp1.py",
# args=[
# # "us",
# # "last365day",
# # "2023-01",
# "us",
# "day",
# "2023-01-01"
# ]
# )
#
# submit_yarn_run(
# file_path="dim/dim_header_category_bsr.py",
# args=["us"]
# )
# HdfsUtils.delete_hdfs_file("/home/big_data_selection/dwd/dwd_bsr_asin_rank/site_name=us/date_type=last30day/date_info=2023-01-30")
# HdfsUtils.delete_hdfs_file("/home/big_data_selection/dim/dim_bsr_asin_rank")
# submit_yarn_run(
# file_path="dwd/dwt_st_sv_last365.py",
# args=["us", "2023-01-30"]
# )
# for i in range(4):
# submit_remote_run(
# file_path="sparkTest/test_read_hive.py",
# args=["31231", "123q31", "adohahso"]
# )
pass
import os
import sys
import traceback
sys.path.append(os.path.dirname(sys.path[0]))
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from yswg_utils.common_udf import parse_weight_str
class CleanWeight(object):
def __init__(self, site_name='us', year=2023, week=18):
self.site_name = site_name
self.year = year
self.week = week
self.week = f'0{self.week}' if int(self.week) < 10 else f'{self.week}'
# 数据库连接参数
self.db_params = {
"pg_us": {
"host": "192.168.10.216", # 数据库主机地址
"port": 5432, # 数据库端口号
"dbname": "selection" if self.site_name == 'us' else f"selection_{self.site_name}", # 数据库名称
"user": "postgres", # 数据库用户名
"password": "fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS" # 数据库密码
},
"mysql_others": {
"host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com", # 数据库主机地址
"port": 3306, # 数据库端口号
"dbname": "selection" if self.site_name == 'us' else f"selection_{self.site_name}", # 数据库名称
"user": "adv_yswg", # 数据库用户名
"password": "HCL1zcUgQesaaXNLbL37O5KhpSAy0c" # 数据库密码
}
}
self.engine_read, self.engine_save = self.create_connection()
@staticmethod
def get_weight(weight_str, site_name):
# 提取到公共方法中 直接复制的
return parse_weight_str(weight_str, site_name)
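# create_connection: us站点2023年第18周及以后(或2024年及以后)的明细从PG读取, 其余情况从MySQL读取; 结果统一写入PG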
def create_connection(self):
# 建立数据库连接
if self.site_name == 'us' and ((int(self.week) >= 18 and int(self.year) >= 2023) or (int(self.year) >= 2024)):
db_params = self.db_params['pg_us']
connection_string = f"postgresql+psycopg2://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
else:
db_params = self.db_params['mysql_others']
connection_string = f"mysql+pymysql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
db_params = self.db_params['pg_us']
connection_string_save = f"postgresql+psycopg2://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
engine_save = create_engine(connection_string_save)
return create_engine(connection_string), engine_save
def read_data(self):
print("开始读取数据")
week_params = f"{int(self.week)}" if self.site_name == 'us' else f"{self.week}"
sql = f"select asin, weight, weight_str from {self.site_name}_asin_detail_{self.year}_{week_params};" # where weight_str is not null
print("sql:", sql)
return pd.read_sql(sql, con=self.engine_read)
def handle_data(self):
df = self.read_data()
if df.shape[0] == 0:
print("site_name, year, week:", self.site_name, self.year, self.week, "数据为空,退出")
print("df.shape:", df.shape)
print("开始处理数据")
df.weight_str = df.weight_str.apply(lambda x: str(x).lower())
# df['weight_info'] = df['weight_str'].apply(self.get_weight)
df['weight_info'] = df.apply(lambda row: self.get_weight(row['weight_str'], self.site_name), axis=1) # 传递多个参数
# df[['weight', 'weight_type']] = df['weight_info'].str.split(',', expand=True)
# tuple 展开
df[['weight', 'weight_type']] = df['weight_info'].apply(pd.Series)
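# 权重清洗: 将'none'置为空值, 转为float, 并把过小的值下限截断到0.001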
df.weight = df.weight.apply(lambda x: np.nan if str(x) == 'none' else x)
df.weight = df.weight.astype("float64")
df.weight = df.weight.apply(lambda x: 0.001 if x <= 0.001 else x)
df.weight_str = df.weight_str.apply(lambda x: np.nan if str(x) == 'none' else x)
df = df.drop(columns=['weight_info'])
df['date_info'] = f'{self.year}-{self.week}'
return df
def save_data(self):
df = self.handle_data()
print("开始存储数据: 先清空对应week的分区表")
print(df.weight_type.value_counts(dropna=False))
with self.engine_save.begin() as conn:
sql = f"truncate {self.site_name}_asin_weight_{self.year}_{self.week};"
print("清空sql:", sql)
conn.execute(sql)
df.to_sql(f"{self.site_name}_asin_weight_{self.year}_{self.week}", con=self.engine_save, if_exists='append', index=False,
chunksize=df.shape[0] // 10)
if __name__ == '__main__':
site_name = sys.argv[1] # 参数1:站点
year = int(sys.argv[2])  # 参数2:年份
week = int(sys.argv[3])  # 参数3:周
handle_obj = CleanWeight(site_name=site_name, year=year, week=week)
handle_obj.save_data()
quit()
site_name = 'de'
site_name_list = ['us', 'de', 'uk', 'es', 'fr', 'it']
week_list = [16, 17, 18, 19]
year = 2023
week = 19
while True:
try:
for week in week_list:
for site_name in site_name_list:
try:
handle_obj = CleanWeight(site_name=site_name, year=year, week=week)
handle_obj.save_data()
except Exception as e:
print("error_info:", traceback.format_exc(), e)
if site_name_list[-1] == site_name and week_list[-1] == week:
print("不满足运行条件,结束")
quit()
continue
break
except Exception as e:
print("error_info:", traceback.format_exc(), e)
continue
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import col
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ABA_2023_10_12_export")
sql1 = """
select
date_info,
search_term,
st_bsr_cate_1_id_new as category_id,
market_cycle_type,
is_first_text,
is_ascending_text,
is_high_return_text,
is_search_text,
st_movie_label,
st_brand_label,
bsr_orders,
st_word_num,
st_num,
rank
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info in ('2023-10','2023-11','2023-12');
"""
df_dwt_aba_st_analytics = spark.sql(sql1).cache()
sql2 = """
select
category_id,
en_name
from dim_bsr_category_tree
where site_name = 'us'
and category_parent_id = 0;
"""
df_dim_bsr_category_tree = spark.sql(sql2).cache()
sql3 = """
select
search_term,
rank_change_rate,
rank_rate_of_change,
date_info
from dwt_aba_last_change_rate
where site_name = 'us'
and date_type = 'month'
and date_info in ('2023-10','2023-11','2023-12');
"""
df_dwt_aba_last_change_rate = spark.sql(sql3).cache()
# 过滤出满足条件的词
df_dwt_aba_st_analytics = df_dwt_aba_st_analytics.filter(
"(is_first_text = 1) or (is_ascending_text = 1) or (market_cycle_type in (1, 2))"
)
df_save = df_dwt_aba_st_analytics.join(
df_dim_bsr_category_tree, on='category_id', how='left'
).join(
df_dwt_aba_last_change_rate, on=['date_info', 'search_term'], how='left'
)
df_save = df_save.select(
col('date_info').alias('year_month'),
col('search_term'),
col('en_name').alias('category'),
col('market_cycle_type'),
col('is_first_text'),
col('is_ascending_text'),
col('is_high_return_text'),
col('is_search_text'),
col('st_movie_label').alias('movie_label'),
col('st_brand_label').alias('brand_label'),
col('bsr_orders'),
col('st_word_num').alias('word_counts'),
col('st_num').alias('word_frequency'),
col('rank'),
col('rank_change_rate').alias('year_on_year'),
col('rank_rate_of_change').alias('month_on_month')
)
df_save.repartition(5).show(10, truncate=True)
df_save.write.saveAsTable(name='tmp_aba_2023_export', format='hive', mode='append')
spark.stop()
import os
import re
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, explode, split, udf, lit
from pyspark.sql.types import ArrayType, StringType
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
spark = SparkUtil.get_spark_session("ABA_2023_10_12_word_frequency")
# 自定义函数,将词组拆分为2个单词为一组
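# 例如: "home office chair" -> ["home office", "office chair"]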
def split_tow_by_tow(search_term):
words = search_term.split()
pairs = []
for i in range(len(words) - 1):
pairs.append(words[i] + ' ' + words[i + 1])
return pairs
u_split_tow_by_tow = udf(split_tow_by_tow, ArrayType(StringType()))
# 自定义函数,将词组拆分为3个单词为一组
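# 例如: "home office chair mat" -> ["home office chair", "office chair mat"]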
def split_three_by_three(search_term):
words = search_term.split()
triplets = []
for i in range(len(words) - 2):
triplets.append(words[i] + ' ' + words[i + 1] + ' ' + words[i + 2])
return triplets
u_split_three_by_three = udf(split_three_by_three, ArrayType(StringType()))
# 自定义函数,剔除掉多余字符
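# 正则会去掉被空格包围的纯标点片段(保留%和'), 并把换行替换为空格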
def characters_to_remove(search_term):
pattern = r'\s[^\w\s%\']+?\s'
cleaned_text = re.sub(pattern, ' ', search_term)
cleaned_text = cleaned_text.replace('\n', ' ')
return cleaned_text
u_characters_to_remove = udf(characters_to_remove, StringType())
sql = f"""
select
search_term
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info = '{date_info}';
"""
df_aba = spark.sql(sql).cache()
df_aba = df_aba.select(
u_characters_to_remove(df_aba['search_term']).alias('search_term')
)
df_one_word = df_aba.select(
explode(split(df_aba['search_term'], ' ')).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-1')
)
df_tow_word = df_aba.select(
explode(u_split_tow_by_tow(df_aba['search_term'])).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-2')
)
df_three_word = df_aba.select(
explode(u_split_three_by_three(df_aba['search_term'])).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-3')
)
df_one_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
df_tow_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
df_three_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, explode, lit, desc, sum
from pyspark.sql.types import ArrayType, StringType
from textblob import Word
from googletrans import Translator
class ABA2023YearWordFrequency(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session("spark_task: aba_2023_year_word_frequency")
self.df_aba_2023 = self.spark.sql(f"select 1+1;")
self.df_beside_category = self.spark.sql(f"select 1+1;")
self.df_translate = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.df_save1 = self.spark.sql(f"select 1+1;")
self.df_save2 = self.spark.sql(f"select 1+1;")
self.df_agg = self.spark.sql(f"select 1+1;")
# 自定义udf
self.u_get_singular_form = self.spark.udf.register('get_singular_form', self.get_singular_form, StringType())
self.u_word_tokenize = self.spark.udf.register('word_tokenize', self.word_tokenize, ArrayType(StringType()))
# self.u_word_translate = self.spark.udf.register('word_translate', self.word_translate, StringType())
@staticmethod
def get_singular_form(word: str):
"""
将单词全部转化为单数形式
"""
if word:
singular_form = Word(word).lemmatize("n")
# word_object = Word(word)
# singular_form = word_object.singularize()
return singular_form
return word
@staticmethod
def word_tokenize(title: str):
"""
分词器
"""
from nltk.tokenize import word_tokenize
result = word_tokenize(title, "english")
return result
# @staticmethod
# def word_translate(word: str):
# if word:
# try:
# translator = Translator()
# result = translator.translate(word, src='en', dest='zh-cn')
# return result.text
# except Exception as e:
# # 处理其他未知错误
# print(f"An unexpected error occurred: {e}")
# return None
# return None
def read_data(self):
sql1 = f"""
select
search_term,
category_id
from dwt_aba_last365
where site_name = 'us'
and date_type = 'last365day'
and date_info = '2023-12';
"""
self.df_aba_2023 = self.spark.sql(sql1).cache()
print("df_aba_2023的数量:")
print(self.df_aba_2023.count())
sql2 = f"""
select
category_id
from dim_bsr_category_tree
where site_name = 'us'
and en_name in ('Audible Books & Originals', 'Books', 'Kindle Store', 'Apps & Games', 'Movies & TV', 'CDs & Vinyl', 'Software', 'Video Games')
and category_parent_id = 0;
"""
self.df_beside_category = self.spark.sql(sql2).cache()
print("df_beside_category的数量:")
print(self.df_beside_category.count())
sql3 = f"""
select
word,
simple_cn as cn
from tmp_en_dict;
"""
self.df_translate = self.spark.sql(sql3).cache()
print("df_translate的数量:")
print(self.df_translate.count())
def handle_data(self):
self.df_save = self.df_aba_2023.join(
self.df_beside_category, on='category_id', how='left_anti'
).select('search_term')
self.df_save = self.df_save.select(explode(self.u_word_tokenize(self.df_save['search_term'])).alias('word'))
self.df_save = self.df_save.groupby(['word']).agg(
count('word').alias('word_frequency')
)
self.df_save = self.df_save.join(
self.df_translate, on='word', how='left'
).withColumn(
'word_singular_form',
self.u_get_singular_form(self.df_save['word'])
).cache()
self.df_save1 = self.df_save.select(
'word', 'word_frequency', 'cn'
).orderBy(
desc('word_frequency')
).withColumn(
'date_info',
lit('2023')
)
print("df_save1的数量:")
print(self.df_save1.count())
self.df_save1.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
print("df_save1存储完成!")
self.df_agg = self.df_save.groupby(['word_singular_form']).agg(
sum('word_frequency').alias('word_frequency')
)
self.df_save2 = self.df_save.select('word', 'cn', 'word_singular_form').join(
self.df_agg, on='word_singular_form', how='left'
).select(
'word', 'word_frequency', 'cn'
).orderBy(
desc('word_frequency')
).withColumn(
'date_info',
lit('2023-merge')
)
print("df_save2的数量:")
print(self.df_save2.count())
self.df_save2.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
print("df_save2存储完成!")
if __name__ == '__main__':
obj = ABA2023YearWordFrequency()
obj.read_data()
obj.handle_data()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
if __name__ == '__main__':
start_date = CommonUtil.get_sys_arg(1, None)
end_date = CommonUtil.get_sys_arg(2, None)
spark_session = SparkUtil.get_spark_session("re-run-aba-month")
sql = f"""
select distinct year_month as date_info from dim_date_20_to_30 where year_month >= '{start_date}' and year_month < '{end_date}';
"""
date_df = spark_session.sql(sql)
date_df.show()
date_list = sorted([d.asDict().get("date_info") for d in date_df.collect()])
print(date_list)
for date_info in date_list:
startParams = {
"site_name": "us",
"date_type": "month",
"date_info": date_info
}
print(startParams)
DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='月-重跑ABA四分位',
startParams=startParams
)
CommonUtil.send_wx_msg(["huangjian", "chenyuanjie"], "【月-重跑ABA四分位】重跑完成", "")
pass
def asin_to_number(asin):
"""
Convert a 10-character ASIN string to a unique number.
This function assumes that ASIN consists of uppercase letters and digits.
"""
def char_to_number(char):
if char.isdigit():
return int(char)
else:
return ord(char) - 55 # 'A' -> 10, 'B' -> 11, ..., 'Z' -> 35
if len(asin) != 10:
raise ValueError("ASIN must be 10 characters long")
base = 36
asin_number = 0
for i, char in enumerate(reversed(asin)):
asin_number += char_to_number(char) * (base ** i)
# Take the result modulo 1,000,000,000 so it always fits in [0, 1e9)
return asin_number % 1000000000
if __name__ == '__main__':
x = asin_to_number('B0CGY4LZQ3')
print(x)
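# asin_to_number() returns a value in [0, 1_000_000_000), so int(x / 1000_0000) + 1 buckets ASINs into partition tables part1 ... part100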
s = f'us_asin_image_part{int(x / 1000_0000) + 1}'
print(s)
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.templates import Templates
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql.window import Window
from pyspark.storagelevel import StorageLevel
from pyspark.sql import functions as F
class DwtMerchantwordsStDetailMerge(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.batch = '2024-1'
self.db_save = 'dwt_merchantwords_st_detail_merge'
self.spark = self.create_spark_object(
app_name=f"DwtMerchantwordsStDetailMerge: {self.site_name}, {self.batch}")
self.partitions_num = 15
self.partitions_by = ['site_name', 'batch']
self.df = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/batch={self.batch}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
def read_data(self):
print("读取dwt_merchantwords_st_detail数据")
sql = f"""
select
keyword,
volume,
avg_3m,
avg_12m,
depth,
results_count,
sponsored_ads_count,
page_1_reviews,
appearance,
last_seen,
update_time,
lang,
batch as last_batch
from dwt_merchantwords_st_detail
where site_name = '{self.site_name}'
and batch in ('2023-1', '2024-1');
"""
self.df = self.spark.sql(sqlQuery=sql)
self.df = self.df.repartition(80).persist(StorageLevel.MEMORY_ONLY)
self.df.show(10, truncate=True)
def handle_data(self):
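# 同一keyword可能同时存在于两个batch, 按batch倒序取最新一条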
window = Window.partitionBy('keyword').orderBy(
F.desc_nulls_last('last_batch')
)
self.df = self.df.withColumn("u_rank", F.row_number().over(window=window))
self.df = self.df.filter('u_rank=1').drop('u_rank')
self.df_save = self.df.withColumn(
'site_name',
F.lit(self.site_name)
).withColumn(
'batch',
F.lit(self.batch)
)
if __name__ == '__main__':
site_name = sys.argv[1]
handle_obj = DwtMerchantwordsStDetailMerge(site_name=site_name)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.templates import Templates
from pyspark.sql import functions as F
class FlowAsinLast30days(Templates):
def __init__(self):
super().__init__()
self.db_save = "tmp_flow_asin_last30days"
self.spark = self.create_spark_object(app_name="FlowAsinLast30days")
self.partitions_num = 20
self.partition_dict = {}
self.df_es = self.spark.sql(f"select 1+1;")
self.df_parent = self.spark.sql(f"select 1+1;")
self.df_joined = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
self.df_es = self.spark.read.format("org.elasticsearch.spark.sql")\
.option("es.nodes", "192.168.10.217")\
.option("es.port", "9200")\
.option("es.net.http.auth.user", "elastic")\
.option("es.net.http.auth.pass", "selection2021.+")\
.option("es.resource", "us_st_detail_last_4_week")\
.option("es.query", '{"query": {"match_all": {}}}')\
.load()
columns = ["asin", "first_category_rank", "asin_bought_month", "total_comments", "variation_num", "site_name", "account_name"]
self.df_es = self.df_es.select(columns).cache()
self.df_es.show()
sql = f"""
select
asin,
parent_asin
from
ods_asin_variat;
"""
self.df_parent = self.spark.sql(sqlQuery=sql).cache()
def handle_data(self):
# self.df_parent = self.df_parent.groupby(["parent_asin"]).agg(F.count("asin").alias("variation_num"))
self.df_joined = self.df_es.join(self.df_parent, "asin", "left")
self.df_joined = self.df_joined\
.withColumn("parent_asin_is_null", F.when(F.col("parent_asin").isNull(), F.lit(1)).otherwise(F.lit(0)))\
.withColumn("parent_asin_exist", F.when(F.col("parent_asin").isNotNull(), F.lit(1)).otherwise(F.lit(0)))
def save_data(self):
self.df_save = self.df_joined
hdfs_path_asin_info = CommonUtil.build_hdfs_path(self.db_save, partition_dict=self.partition_dict)
print(f"清除hdfs目录中:{hdfs_path_asin_info}")
HdfsUtils.delete_file_in_folder(hdfs_path_asin_info)
print(f"当前存储的表名为:{self.db_save}")
self.df_save.write.saveAsTable(name=self.db_save, format='hive', mode='append')
print("success")
if __name__ == '__main__':
obj = FlowAsinLast30days()
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ["2024-02","2024-01","2023-12","2023-11","2023-10","2023-09"]
for date_info in date_list:
startParams = {
"site_name": "us",
"date_type": "month",
"date_info": date_info
}
print(startParams)
DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='export_dwt_flow_asin_api',
startParams=startParams
)
CommonUtil.send_wx_msg(["chenyuanjie", "wangrui4"], "【export_dwt_flow_asin_api】导出完成", "")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
from datetime import datetime
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
n = CommonUtil.get_sys_arg(2, 0)
hive_tb = "dwt_merchantwords_st_detail"
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session(f"export: {hive_tb}")
# 一次导出400w条数据
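# n为批次序号(从1开始), 第n批对应行号区间[(n-1)*400w+1, n*400w]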
batch_size = (int(n)-1) * 4000000
start_index = 1 + batch_size
end_index = 4000000 + batch_size
# 构建 URL 的函数
def build_urls(search_term):
    url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
    encoded_term = quote(search_term, 'utf-8')
    encoded_term = encoded_term.replace("'", '%27').replace('/', '%2F')
    encoded_term = encoded_term.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
    urls = [url_template.format(search_term=encoded_term, page_number=page_number) for page_number in (1, 2, 3)]
    return urls
# 将Python函数转换为UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
# 从 PostgreSQL 数据库中读取已有数据
# df_pg = spark.read.format("jdbc") \
# .option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
# .option("dbtable", export_tb) \
# .option("user", "yswg_postgres") \
# .option("password", "yswg_postgres") \
# .load()
# df_pg = df_pg\
# .select("search_term") \
# .drop_duplicates(["search_term"]) \
# .repartition(70) \
# .cache()
# 从 Hive 表中读取数据
df_hive = spark.sql(f"SELECT keyword FROM {hive_tb}")
df_hive = df_hive\
.withColumn("row_num", row_number().over(Window.orderBy("keyword")))\
.filter(f"row_num BETWEEN {start_index} AND {end_index}")\
.select("keyword")\
.repartition(10) \
.cache()
# 过滤掉keyword含有中文的数据
df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
# 过滤掉已存在于目标数据库中的数据
# df_hive = df_hive.join(df_pg, df_hive["keyword"] == df_pg["search_term"], "leftanti")
# 如果没有数据需要导出,退出循环
if df_hive.count() == 0:
print("-------数据已全部导出!-------")
quit()
df_hive = df_hive.selectExpr("keyword AS search_term")
df_hive = df_hive.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_hive = df_hive.withColumn("date_info", lit(date_info))
# 导出数据到 PostgreSQL 数据库
df_hive.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit, length
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
n = CommonUtil.get_sys_arg(2, 0)
import_tb = "search_term_result_year"
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSRToPG16")
# 一次导出400w条数据
batch_size = (int(n)-1) * 4000000
start_index = 1 + batch_size
end_index = 4000000 + batch_size
# 构建 URL 的函数
def build_urls(search_term):
    url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
    encoded_term = quote(search_term, 'utf-8')
    encoded_term = encoded_term.replace("'", '%27').replace('/', '%2F')
    encoded_term = encoded_term.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
    urls = [url_template.format(search_term=encoded_term, page_number=page_number) for page_number in (1, 2, 3)]
    return urls
# 将Python函数转换为UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
# 从SR数据库中读取已有数据
df = spark.read.format("jdbc") \
.option("url", "jdbc:mysql://192.168.10.151:19030/test") \
.option("dbtable", import_tb) \
.option("user", "chenyuanjie") \
.option("password", "chenyuanjie12345") \
.load()
df = df.withColumn(
"row_num",
row_number().over(Window.orderBy("search_term"))
).filter(f"row_num BETWEEN {start_index} AND {end_index}").repartition(20).cache()
# 过滤掉keyword含有中文的数据
df = df.filter(~df["search_term"].rlike("[\u4e00-\u9fff]"))
# 如果没有数据需要导出,退出循环
if df.count() == 0:
print("-------数据已全部导出!-------")
quit()
df = df.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df = df.filter(length(df['url']) <= 450)
df = df.withColumn("date_info", lit(date_info))
# 导出数据到 PostgreSQL 数据库
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.StarRocksHelper import StarRocksHelper
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ods_asin_detail_sr_to_hive")
partition_dict = {
"site_name": 'us',
"date_type": 'month',
"date_info": '2024-03'
}
hdfs_path = CommonUtil.build_hdfs_path('ods_asin_detail_test', partition_dict=partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
connection_info = StarRocksHelper.get_connection_info('selection')
df_sr = spark.read.format("starrocks") \
.option("starrocks.fe.http.url", f"{connection_info['ip']}:{connection_info['http_port']}") \
.option("starrocks.fe.jdbc.url", f"jdbc:mysql://{connection_info['ip']}:{connection_info['jdbc_port']}") \
.option("starrocks.table.identifier", "test.ods_asin_detail_test2") \
.option("starrocks.user", connection_info['user']) \
.option("starrocks.password", connection_info['pwd']) \
.option("starrocks.request.tablet.size", "1") \
.option("starrocks.batch.size", "40960") \
.option("starrocks.exec.mem.limit", "21474836480") \
.load()
print("读取完毕")
df_sr = df_sr.repartition(50)
partitions_by = ['site_name', 'date_type', 'date_info']
df_sr.write.saveAsTable(name='ods_asin_detail_test', format='hive', mode='append', partitionBy=partitions_by)
spark.stop()
# 创建lzo索引和修复元数据
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb='ods_asin_detail_test')
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ods_asin_detail_to_sr_test")
sql = """
select
*
from ods_asin_detail
where site_name = 'us'
and date_type = 'month'
and date_info = '2024-03'
"""
df_hive = spark.sql(sql).repartition(40)
connection_info = StarRocksHelper.get_connection_info('selection')
df_hive.write.format("starrocks") \
.option("starrocks.fe.http.url", f"{connection_info['ip']}:{connection_info['http_port']}") \
.option("starrocks.fe.jdbc.url", f"jdbc:mysql://{connection_info['ip']}:{connection_info['jdbc_port']}") \
.option("starrocks.table.identifier", "test.ods_asin_detail_test") \
.option("starrocks.user", connection_info['user']) \
.option("starrocks.password", connection_info['pwd']) \
.option("starrocks.write.flush.interval.ms", "10000") \
.option("starrocks.write.properties.column_separator", "~!@#$%^&*~!@#$%^&*") \
.mode("append") \
.save()
print("导出完毕")
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2022-02', '2022-03', '2022-04', '2022-05', '2022-06',
'2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2022年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2023-11', '2023-12']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2023年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2024-01', '2024-02', '2024-03', '2024-04',
'2024-05', '2024-06', '2024-07', '2024-08']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2024年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
from datetime import datetime
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
year, month, day = date_info.split("-")
table = f"us_merchantwords_brand_analytics_2024_{month}_{day}"
spark = SparkUtil.get_spark_session(f"us_merchantwords_brand_analytics_2024:pg2pg,{date_info}")
df = spark.read.format("jdbc") \
.option("url", "jdbc:postgresql://113.100.143.162:5432/selection") \
.option("dbtable", table) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.load()
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://113.100.143.162:5443/selection") \
.option("dbtable", table) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
export_tb = "de_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSupplement")
# 构建 URL 的函数
def build_urls(search_term):
    url_template = "https://www.amazon.de/s?k={search_term}&page={page_number}"
    encoded_term = quote(search_term, 'utf-8')
    encoded_term = encoded_term.replace("'", '%27').replace('/', '%2F')
    encoded_term = encoded_term.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
    urls = [url_template.format(search_term=encoded_term, page_number=page_number) for page_number in (1, 2, 3)]
    return urls
# 将Python函数转换为UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
sql1 = """
select
keyword,
volume,
st_monthly_sales,
greatest(results_count, asin_total_num) as asin_total_num,
st_sp_counts,
st_zr_counts
from dwt_merchantwords_merge
where site_name = 'de'
and batch = '2024-07-01'
"""
df_dwt_merchantwords_merge = spark.sql(sql1)
# sql2 = """
# select
# keyword
# from dwt_merchantwords_st_detail
# where site_name = 'de'
# and batch = '2024-1'
# """
# df_dwt_merchantwords_st_detail = spark.sql(sql2)
# 产品总数大于80且没有月销
df1 = df_dwt_merchantwords_merge.filter('asin_total_num > 80 and st_monthly_sales <= 0').select('keyword')
print("产品总数大于80且没有月销:" + str(df1.count()))
# 搜索量较大且没有sp广告词
df2 = df_dwt_merchantwords_merge.filter('volume >= 1 and st_sp_counts <= 0').select('keyword')
print("搜索量较大且没有sp广告词:" + str(df2.count()))
# 自然词总数 <= 0的部分
df3 = df_dwt_merchantwords_merge.filter('st_zr_counts <= 0').select('keyword')
print("自然词总数 <= 0的部分:" + str(df3.count()))
# # 过滤掉keyword含有中文的数据
# df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
df_save = df1.union(df2).union(df3).drop_duplicates(['keyword'])
df_save = df_save.selectExpr("keyword AS search_term")
df_save = df_save.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_save = df_save.withColumn("date_info", lit('2024-06-26'))
# 导出数据到 PostgreSQL 数据库
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5433/selection_de") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSupplement")
# 构建 URL 的函数
def build_urls(search_term):
    url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
    encoded_term = quote(search_term, 'utf-8')
    encoded_term = encoded_term.replace("'", '%27').replace('/', '%2F')
    encoded_term = encoded_term.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
    urls = [url_template.format(search_term=encoded_term, page_number=page_number) for page_number in (1, 2, 3)]
    return urls
# 将Python函数转换为UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
sql1 = """
select
keyword,
volume,
st_zr_counts,
st_sp_counts
from dwt_merchantwords_merge
where site_name = 'us'
and batch = '2024-07-01'
"""
df_dwt_merchantwords_merge = spark.sql(sql1)
# 搜索量较大且没有sp广告词
df1 = df_dwt_merchantwords_merge.filter('volume >= 1 and st_sp_counts <= 0').select('keyword')
print("搜索量较大且没有sp广告词:" + str(df1.count()))
# 自然词总数 <= 0的部分
df2 = df_dwt_merchantwords_merge.filter('st_zr_counts <= 0').select('keyword')
print("自然词总数 <= 0的部分:" + str(df2.count()))
# # 过滤掉keyword含有中文的数据
# df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
df_save = df1.union(df2).drop_duplicates(['keyword'])
df_save = df_save.selectExpr("keyword AS search_term")
df_save = df_save.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_save = df_save.withColumn("date_info", lit('2024-06-26'))
# 导出数据到 PostgreSQL 数据库
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5433/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.DorisHelper import DorisHelper
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
if __name__ == '__main__':
spark = SparkUtil.get_spark_session('aba_to_doris_test')
sql = f"""
select *
from dwt_aba_last365
where site_name = 'us'
and date_type = 'month'
and date_info = '2024-10';
"""
df_aba = spark.sql(sql).drop('site_name', 'date_type').cache()
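# 将月份分区'2024-10'转成'20241001'形式再导出到Doris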
df_aba = df_aba.withColumn(
'date_info', F.concat(F.regexp_replace('date_info', '-', ''), F.lit('01'))
)
df_aba.show(10, True)
columns = df_aba.columns
columns_str = ",".join(columns)
DorisHelper.spark_export_with_columns(df_aba, 'test', 'dwt_aba_last365', columns_str)
print('导出完成')
from openai import OpenAI
api_key = "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"
client = OpenAI(api_key=api_key)
completion = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "Write a haiku about recursion in programming."
}
]
)
print(completion.choices[0].message)
import requests
response = requests.post(
f"https://api.stability.ai/v2beta/stable-image/generate/ultra",
headers={
"authorization": f"sk-f2iOAkResIloOY3yE6xk2LlQbVrtQi3EczZDjA3n9ns7bmeR",
"accept": "image/*"
},
files={"none": ''},
data={
"prompt": "A little cat is in a bedroom with a bed, TV, and sofa",
"output_format": "webp",
},
)
if response.status_code == 200:
with open("./cat01.webp", 'wb') as file:
file.write(response.content)
else:
raise Exception(str(response.json()))
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, BooleanType, MapType
"""
merchantwords 搜索词分词词频
"""
def is_number(str):
"""
判断一个字符串是否是数字
:param str:
:return:
"""
import re
return re.match(r"^-?\d+(\.\d+)?$", str) is not None
def word_tokenize(keyword: str):
import re
keyword = re.sub(r'(\d+\.?\d*|-|\"|,|,|?|\?|/|、|)', '', keyword).strip()
from nltk.tokenize import word_tokenize
result = word_tokenize(keyword, "english")
# 过滤标点如下
filter_arr = [
" ", "\t", "\r", "\n", "(", ")", ",", ",", "[", "]", "、", "-", ":", "&", "|", "+", "``", "'", "'", "\""
]
return list(filter(lambda x: not is_number(x) and x not in filter_arr, result))
def run():
spark = SparkUtil.get_spark_session("app_name")
udf_word_tokenize = F.udf(word_tokenize, ArrayType(StringType()))
keywords_all = spark.sql("select keyword from dwt_merchantwords_st_detail where site_name='us'").cache()
df_all = keywords_all.withColumn("word", F.explode(udf_word_tokenize(F.col("keyword"))))
df_all = df_all.groupby(F.col("word")) \
.agg(F.count("word").alias("frequency")) \
.orderBy(F.col("frequency").desc()) \
.select(
F.col("word"),
F.col("frequency"),
F.lit("us").alias("site_name")
)
hive_tb = 'tmp_word_frequency'
# # 去重
partition_dict = {
"site_name": "us"
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
partition_by = list(partition_dict.keys())
print(f"当前存储的表名为:{hive_tb},分区为{partition_by}", )
df_all.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
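# word_pluralize: 基于textblob计算单词的单复数形式; 例如'boxes'通常得到singularize='box', pluralize='boxes', pluralizeFlag=True, not_regular=False (以textblob默认词形变化为准)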
def word_pluralize(keyword: str):
from textblob import Word
# 单数形式
singularize = Word(keyword).singularize().string
# 复数形式
pluralize = Word(singularize).pluralize().string
result = {
"text": keyword,
"singularize": singularize,
"pluralize": pluralize,
"pluralizeFlag": keyword == pluralize,
"not_regular": keyword not in [singularize, pluralize]
}
return result
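# word_stem: Snowball英文词干提取, 例如'running'一般会得到'run'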
def word_stem(keyword: str):
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)
return stemmer.stem(keyword)
def word_test():
spark = SparkUtil.get_spark_session("word_test")
udf_word_pluralize = F.udf(word_pluralize, StructType(
[
StructField('text', StringType(), True),
StructField('singularize', StringType(), True),
StructField('pluralize', StringType(), True),
StructField('pluralizeFlag', BooleanType(), True),
StructField('not_regular', BooleanType(), True),
]
))
udf_word_stem = F.udf(word_stem, StringType())
keywords_all = spark.sql("select word,frequency from tmp_word_frequency").cache()
keywords_all = keywords_all.withColumn("resultMap", udf_word_pluralize(F.col("word"))).select(
F.col("word"),
F.col("frequency"),
F.col("resultMap").getField("singularize").alias("singularize"),
F.col("resultMap").getField("pluralize").alias("pluralize"),
F.col("resultMap").getField("pluralizeFlag").alias("pluralizeFlag"),
F.col("resultMap").getField("not_regular").alias("not_regular"),
).where("(pluralizeFlag == true) or (not_regular == true)")
# 计算词根
keywords_all = keywords_all.withColumn("word_stem", udf_word_stem(F.col("word")))
keywords_all = keywords_all.withColumn("singularize_stem", udf_word_stem(F.col("singularize")))
keywords_all = keywords_all.withColumn("pluralize_stem", udf_word_stem(F.col("pluralize")))
hive_tb = 'tmp_word_not_regular_v2'
keywords_all.write.saveAsTable(name=hive_tb, format='hive', mode='append')
print("success")
def word_for_download():
spark = SparkUtil.get_spark_session("word_for_calc")
keywords_all = spark.sql("""
select word
from tmp_for_market
order by volume desc
""")
CommonUtil.df_export_csv(spark, keywords_all, csv_name='word_for_calc', limit=200 * 10000)
print("success")
pass
if __name__ == '__main__':
# word_for_calc()
word_for_download()
print("success")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from pyspark.sql.types import StringType
from utils.templates import Templates
from google.cloud import translate_v2 as translate
class Test(Templates):
def __init__(self):
super().__init__()
self.spark = self.create_spark_object(app_name=f"test")
self.df_st = self.spark.sql(f"select 1+1;")
self.translate_client = translate.Client()
# 自定义udf
self.u_translate_text = self.spark.udf.register('translate_text', self.translate_text, StringType())
def translate_text(self, word: str, target_language='zh'):
result = self.translate_client.translate(word, target_language=target_language)
return result['translatedText']
def read_data(self):
sql1 = f"""
select
search_term
from dwt_aba_last365
where site_name = 'us'
and date_type = 'last365day'
and date_info = '2023-12';
"""
self.df_st = self.spark.sql(sql1).limit(20).cache()
def handle_data(self):
self.df_st = self.df_st.withColumn(
'translate_text',
self.u_translate_text(self.df_st['search_term'])
)
self.df_st.show(20, False)
if __name__ == '__main__':
handle_obj = Test()
handle_obj.run()
import os
import sys
import json
sys.path.append(os.path.dirname(sys.path[0]))
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
assert site_name is not None, "site_name 不能为空!"
assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!"
hive_table = f"dwt_flow_asin"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
# 获取计算分区
msg_params = ""
# 解析partition_dict获取分区查询条件
partition_conditions = []
for key, value in partition_dict.items():
if value is not None:
msg_params += f"{value} "
partition_conditions.append(f"{key} = '{value}'")
base_msg = f"{hive_table} {msg_params} "
site_name = partition_dict.get("site_name")
date_type = partition_dict.get("date_type")
spark_session = SparkUtil.get_spark_sessionV3("check_fields_rule")
# 获取维护的字段验证配置表数据
config_table_query = f"""select * from hive_field_verify_config
where table_name ='{hive_table}'
and site_name = '{site_name}'
and use_flag = 1 """
conn_info = DBUtil.get_connection_info('postgresql', 'us')
check_field_df = SparkUtil.read_jdbc_query(
session=spark_session,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=config_table_query
)
# 获取验证消息
check_field_list = check_field_df.select('field_name', 'verify_desc', 'verify_type', 'config_json',
'msg_usr_list').collect()
if not check_field_list:
print("============================无验证匹配条件跳过验证===================================")
exit()
# 创建一个df用于储存验证情况
# 定义列的结构
schema = StructType([
StructField("验证描述", StringType(), True),
StructField("验证类型", StringType(), True),
StructField("校验字段", StringType(), True),
StructField("校验条件查询占比", StringType(), True),
StructField("验证占比临界值上限", StringType(), True),
StructField("验证占比临界值下限", StringType(), True),
StructField("是否验证通过", IntegerType(), True),
])
# 使用定义的结构创建空的 DataFrame
check_df = spark_session.createDataFrame([], schema)
# 进行验证sql组装
query = f"""
SELECT COUNT(1) AS total_count
FROM {hive_table}
"""
# 拼接where条件
if partition_conditions:
query_total = query + f" WHERE {' AND '.join(partition_conditions)}"
# 执行sql获取验证值与df
total_df = spark_session.sql(query_total).cache()
total_count = int(total_df.collect()[0]['total_count'])
for row in check_field_list:
vertify_flag = True
field_name = row['field_name']
verify_type = row['verify_type']
config_json = json.loads(row['config_json'])
msg_usr = row['msg_usr_list']
msg_usr_list = [user.strip() for user in msg_usr.split(",")] if msg_usr else []
sql_condition = config_json['sql_condition']
partition_conf_list = config_json['partition_conf']
for conf in partition_conf_list:
conf_site_name = conf["site_name"]
conf_date_type = conf["date_type"]
if site_name == conf_site_name and date_type == conf_date_type:
vertify_flag = True
break
else:
vertify_flag = False
# 没有合适的匹配维度
if not vertify_flag:
break
# 拼接外部查询条件
if sql_condition:
query_field_check = query_total + f" AND {sql_condition} "
check_count_df = spark_session.sql(query_field_check).cache()
check_count = int(check_count_df.collect()[0]['total_count'])
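# 命中占比 = 满足sql_condition的行数 / 分区总行数, 落在配置的[min_rate, max_rate]区间内即视为通过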
calcult_rate = round((check_count / total_count), 3)
waring_max = conf['max_rate']
waring_min = conf['min_rate']
verify_flag = 1 if (calcult_rate <= waring_max) and (calcult_rate >= waring_min) else 0
ratio_df = spark_session.createDataFrame([(row['verify_desc'],verify_type,field_name,calcult_rate,waring_max,waring_min,verify_flag)],schema).repartition(1)
check_df = check_df.unionByName(ratio_df, False)
if check_df.count() < 1 :
print("无验证项验证")
exit()
check_df.show(50, truncate=False)
# 对校验结果进行判断是否有校验不通过的数据
schema_flag = bool(check_df.select(F.min("是否验证通过").alias("result")).first().asDict()['result'])
if not schema_flag:
msg = f"数据表:{hive_table} {msg_params},计算数据存在验证不通过,请检查数据是否异常!!具体信息请查看日志!!"
CommonUtil.send_wx_msg(['chenjianyun'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
spark_session.stop()
pass
def word_tokenize(title: str):
"""
分词器
"""
from nltk.tokenize import word_tokenize
result = word_tokenize(title, "english")
return result
if __name__ == '__main__':
aba = "nation's bravest tales of courage and heroism"
print(word_tokenize(aba))
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
if __name__ == '__main__':
export_tb = "de_brand_analytics_month"
spark = SparkUtil.get_spark_session("update_de_brand_analytics_month_2024_05")
sql1 = """
select
search_term
from ods_st_quantity_being_sold
where site_name = 'de'
and date_type = 'month'
and date_info = '2024-05'
and quantity_being_sold in (16, 48)
"""
df_aba = spark.sql(sql1)
sql2 = """
select
search_term,
quantity_being_sold
from dwt_merchantwords_merge
where site_name = 'de'
"""
df_me = spark.sql(sql2)
df_save = df_aba.join(
df_me, on='search_term', how='inner'
)
# 导出数据到 PostgreSQL 数据库
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.223:5433/selection_de") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("UpdateMerchantwords")
hive_tb = 'dwd_merchantwords_measure'
partition_dict = {
"site_name": 'us',
"batch": '2023-01'
}
sql1 = f"""
select
keyword,
lang,
st_ao_val,
st_zr_flow_proportion,
min_bid,
max_bid,
suggested_bid,
volume,
avg_3m,
avg_12m,
asin_total_num,
asin_num,
self_asin_num,
self_asin_proportion,
st_sp_counts,
st_zr_counts,
st_monthly_sales,
listing_sales_avg,
reviews_avg,
rating_avg,
price_avg,
depth
from dwd_merchantwords_measure
where site_name = 'us'
and batch = '2023-01';
"""
df_dwd = spark.sql(sqlQuery=sql1).cache()
df_dwd = df_dwd.repartition(80)
sql2 = f"""
select
keyword,
results_count,
sponsored_ads_count,
page_1_reviews,
appearance,
last_seen,
update_time
from dwt_merchantwords_st_detail
where site_name = 'us'
and batch = '2023-1';
"""
df_merchantwords_detail = spark.sql(sqlQuery=sql2)
df_merchantwords_detail = df_merchantwords_detail \
.withColumn("row_num", row_number().over(Window.orderBy("keyword"))) \
.filter("row_num BETWEEN 1 AND 12000000") \
.repartition(80) \
.drop("row_num") \
.cache()
df = df_dwd.join(df_merchantwords_detail, 'keyword', 'left')
df = df.withColumn(
'site_name',
F.lit('us')
).withColumn(
'batch',
F.lit('2023-01')
)
CommonUtil.save_or_update_table(spark_session=spark,
hive_tb_name=hive_tb,
partition_dict=partition_dict,
df_save=df,
drop_exist_tmp_flag=True)
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, col
class WordFrequency(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session("us_aba_last365_word_frequency")
def run(self):
sql1 = f"""
select search_term, date_info
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info in
('2024-10', '2024-09', '2024-08', '2024-07', '2024-06', '2024-05',
'2024-04', '2024-03', '2024-02', '2024-01', '2023-12', '2023-11')
and rank <= 1000000
and st_brand_label = 1;
"""
df_st = self.spark.sql(sql1).cache()
print("df_st数量是:")
print(df_st.count())
sql2 = f"""
select search_term, first_match_brand as brand, date_info
from dws_st_brand_info
where site_name = 'us'
and date_type = 'month'
and date_info in
('2024-10', '2024-09', '2024-08', '2024-07', '2024-06', '2024-05',
'2024-04', '2024-03', '2024-02', '2024-01', '2023-12', '2023-11')
and st_brand_label = 1;
"""
df_brand = self.spark.sql(sql2).cache()
print("df_brand数量是:")
print(df_brand.count())
df_save = df_st.join(
df_brand, on=['date_info', 'search_term'], how='left'
).drop('date_info')
print("df_save数量是:")
print(df_save.count())
df_save = df_save.groupby(['brand']).agg(
count('brand').alias('frequency')
).orderBy('frequency', ascending=False)
df_save.show(20, False)
df_save = df_save.withColumn("frequency", col("frequency").cast("int"))
total_sum = df_save.select("frequency").groupBy().sum().collect()[0][0]
if total_sum == df_st.count():
print('验证成功')
else:
print('验证失败')
output_path = "hdfs:///user/chenyuanjie/test1/"
df_save.write.mode("overwrite").format("csv").option("delimiter", "^").option("lineSep", "\n").option("header", "false").option("compression", "none").save(output_path)
if __name__ == '__main__':
obj = WordFrequency()
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
hive_tb = "tmp_us_st_keepa_syn_2024"
hdfs_path = "/home/big_data_selection/tmp/tmp_us_st_keepa_syn_2024"
print(f"hdfs_path is {hdfs_path}")
query = f"""
select
asin
from us_st_keepa_syn_2024
where 1 = 1
and \$CONDITIONS
"""
db_type = "postgresql"
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
site_name='us',
query=query,
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie']
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name='us',
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# delete the HDFS target path before importing
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import col, lit
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("us_st_keepa_syn_2024_export")
# read existing data from the StarRocks database
sql = """
select distinct asin from selection.us_asin_latest_detail where date_info = '2024-06'
"""
df_sr = StarRocksHelper.spark_import_with_sql(spark, sql).repartition(80, 'asin').cache()
print("starrocks读取:")
df_sr.show(10)
sql = """
select asin from tmp_us_st_keepa_syn_2024;
"""
df_pg = spark.sql(sql).drop_duplicates(['asin']).repartition(80, 'asin').cache()
print("pg读取:")
df_pg.show(10)
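# subtract keeps the ASINs that exist in StarRocks but are missing from the PG sync table;
# only those rows are appended back to us_st_keepa_syn_2024 with state = 7 below.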
df = df_sr.subtract(df_pg)
df_sr.unpersist()
df_pg.unpersist()
df = df.withColumn(
'state',
lit(7)
).withColumn(
'asin_trun_4',
col('asin').substr(1, 4)
)
df.show(10)
print(df.count())
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.224:5433/selection") \
.option("dbtable", "us_st_keepa_syn_2024") \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.db_util import DBUtil
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("us_st_keepa_syn_2024_export")
# read existing data from the StarRocks database
sql = """
select distinct asin from selection.us_asin_latest_detail where date_info = '2024-06' and (asin_launch_time>'2024-07-19' or asin_launch_time<'1990-01-01')
"""
df_sr = StarRocksHelper.spark_import_with_sql(spark, sql).repartition(80, 'asin').cache()
print("starrocks读取:")
df_sr.show(10)
sql = """
select asin from tmp_us_st_keepa_syn_2024;
"""
df_pg = spark.sql(sql).drop_duplicates(['asin']).repartition(80, 'asin').cache()
print("pg读取:")
df_pg.show(10)
df = df_sr.subtract(df_pg)
print(df.count())
df_sr.unpersist()
df_pg.unpersist()
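# Collect the diffed ASINs to the driver and flip their state to 5 in PG with a single UPDATE;
# this assumes the diff is small enough to fit in memory and in one IN (...) clause.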
update_asin = df.select("asin").rdd.map(lambda row: row[0]).collect()
print(update_asin)
pg_engine = DBUtil.get_db_engine('postgresql', 'us')
if update_asin:
# build the IN list explicitly so a single-element list does not produce an invalid trailing comma
in_list = ", ".join(f"'{asin}'" for asin in update_asin)
with pg_engine.begin() as conn:
update_query = f"""
UPDATE us_st_keepa_syn_2024 SET state = 5 WHERE asin IN ({in_list})
"""
conn.execute(update_query)
spark.stop()
import os
import sys
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
class VerifyRank(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
def run(self):
sql = f"""
select
search_term,
rank,
date_info
from ods_brand_analytics
where site_name = 'us'
and date_type = 'week'
and date_info >= '2024-01'
and rank < 100000
"""
df_all = self.spark.sql(sql).repartition(40, 'search_term').cache()
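# For each search term, replace every weekly rank with the mean of all *other* ranks
# (leave-one-out), so a single anomalous week does not drag its own baseline.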
def leave_one_out_means(structs):
ranks = [x['rank'] for x in structs]
date_infos = [x['date_info'] for x in structs]
total_sum = sum(ranks)
n = len(ranks)
if n > 1:
means = [round((total_sum - rank) / (n - 1), 2) for rank in ranks]
else:
means = [float(ranks[0])]
result = [{"means": mean, "date_info": date_info} for mean, date_info in zip(means, date_infos)]
return result
leave_one_out_means_udf = F.udf(leave_one_out_means, ArrayType(StructType([
StructField("means", FloatType(), True),
StructField("date_info", StringType(), True)
])))
df_agg = df_all.groupBy("search_term").agg(
F.collect_list(F.struct("rank", "date_info")).alias("collect_row")
# F.collect_list("rank").alias("values")
)
df_agg = df_agg.withColumn(
"collect_row", leave_one_out_means_udf(F.col("collect_row"))
)
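# Approximate Q1/Q3 of the leave-one-out means per search term with simple index-based
# quantiles (no interpolation), which is adequate for the coarse outlier bounds below.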
def calc_quantiles(structs):
values = [x['means'] for x in structs]
values = sorted(values)  # sort the values within the group
n = len(values)
# indices of Q1 and Q3 (based on the 25% and 75% positions)
q1_index = int(n * 0.25)
q3_index = int(n * 0.75)
if n > 1:
q1 = values[q1_index]
q3 = values[q3_index]
else:
q1 = values[0]
q3 = values[0]
return [float(q1), float(q3)]
quantile_udf = F.udf(calc_quantiles, ArrayType(FloatType()))
df_agg = df_agg.withColumn(
"quantiles", quantile_udf(F.col("collect_row"))
).withColumn(
"q1", F.col("quantiles")[0]
).withColumn(
"q3", F.col("quantiles")[1]
).withColumn(
"iqr", F.expr("q3 - q1")
).withColumn(
"lower_bound", F.expr("q1 - 100 * iqr")
).withColumn(
"upper_bound", F.expr("q3 + 100 * iqr")
).select(
'search_term', 'collect_row', 'lower_bound', 'upper_bound'
).repartition(40, 'search_term')
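# Flag weeks whose leave-one-out mean falls outside [q1 - 100*IQR, q3 + 100*IQR] -- an
# intentionally loose band -- then keep only terms whose outliers do not include 2024-08.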
df_save = df_agg.withColumn(
"filtered_collect_row",
F.filter(
"collect_row",
lambda x: (x["means"] < F.col("lower_bound")) | (x["means"] > F.col("upper_bound"))
)
).filter(
F.size(F.col("filtered_collect_row")) > 0
).withColumn(
"has_2024_08",
F.exists(
"filtered_collect_row",
lambda x: x["date_info"].like("2024-08%")
)
).filter(
~F.col("has_2024_08") # 过滤掉包含 '2024-08' 的行
).select(
'search_term', 'filtered_collect_row', 'lower_bound', 'upper_bound'
)
df_save.show(20, False)
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
obj = VerifyRank()
obj.run()
import json
from utils.datahub_util import build_column_lineages
to_tb = "dim_asin_label"
sources = [
{
"form": "ods_other_search_term_data",
"mappings": [
("asin", "asin"),
("label", "asin_label_list"),
("label", "asin_label_type"),
]
}
]
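# Expand every (source column, target column) pair into a from/to edge and register the
# column-level lineage in DataHub; the "form" key is this config's spelling of the source table.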
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dim_asin_detail"
sources = [
{
"form": "ods_asin_detail",
"mappings": [
("asin", "asin"),
("img_url", "asin_img_url"),
("title", "asin_title"),
("title_len", "asin_title_len"),
("price", "asin_price"),
("rating", "asin_rating"),
("total_comments", "asin_total_comments"),
("buy_box_seller_type", "asin_buy_box_seller_type"),
("page_inventory", "asin_page_inventory"),
("category", "asin_category_desc"),
("volume", "asin_volume"),
("rank", "asin_rank"),
("launch_time", "asin_launch_time"),
("img_num", "asin_img_num"),
("img_type", "asin_img_type"),
("category_state", "asin_category_state"),
("material", "asin_material"),
("brand", "asin_brand_name"),
("activity_type", "asin_activity_type"),
("one_two_val", "act_one_two_val"),
("three_four_val", "act_three_four_val"),
("five_six_val", "act_five_six_val"),
("eight_val", "act_eight_val"),
("qa_num", "qa_num"),
("one_star", "one_star"),
("two_star", "two_star"),
("three_star", "three_star"),
("four_star", "four_star"),
("five_star", "five_star"),
("low_star", "low_star"),
("together_asin", "together_asin"),
("ac_name", "ac_name"),
("node_id", "node_id"),
("data_type", "asin_data_type"),
("sp_num", "sp_num"),
("describe", "asin_describe"),
("package_quantity", "asin_package_quantity"),
("pattern_name", "asin_pattern_name"),
]
},
{
"form": "dim_asin_variation_info",
"mappings": [
("parent_asin", "parent_asin"),
("color", "asin_color"),
("size", "asin_size"),
("style", "asin_style"),
("state", "asin_is_sale"),
]
},
{
"form": "ods_asin_keep_date",
"mappings": [
("launch_time", "asin_launch_time"),
]
},
{
"form": "ods_other_search_term_data",
"mappings": [
("label", "asin_label_list"),
]
},
{
"form": "ods_other_search_term_data",
"mappings": [
("weight_str", "asin_weight_str"),
("weight", "asin_weight"),
("weight_type", "asin_weight_type"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dim_brand_info"
sources = [
{
"form": "dim_st_detail",
"mappings": [
("search_term", "search_term"),
]
},
{
"form": "dwd_st_asin_measure",
"mappings": [
("search_term", "search_term"),
]
},
{
"form": "dim_asin_detail",
"mappings": [
("asin_brand_name", "asin_brand_name"),
]
}
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
from utils.datahub_util import build_column_lineages
to_tb = "dim_bsr_asin_rank_history"
sources = [
{
"form": "ods_bs_category_top100_asin",
"mappings": [
("asin", "asin"),
("cate_current_id", "old_category_id"),
("category_id", "category_id"),
("bsr_rank", "bsr_rank"),
("rating", "asin_rating"),
("total_comments", "asin_total_comments"),
("updated_at", "updated_at"),
("date_info", "date_info"),
("site_name", "site_name"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dim_cal_asin_history_detail"
sources = [
{
"form": "dim_asin_detail",
"mappings": [
("asin", "asin"),
("asin_title", "asin_title"),
("asin_title_len", "asin_title_len"),
("asin_category_desc", "asin_category_desc"),
("asin_rank", "asin_rank"),
("asin_volume", "asin_volume"),
("asin_weight", "asin_weight"),
("asin_color", "asin_color"),
("asin_size", "asin_size"),
("asin_style", "asin_style"),
("asin_price", "asin_price"),
("asin_rating", "asin_rating"),
("asin_total_comments", "asin_total_comments"),
("asin_material", "asin_material"),
("asin_brand_name", "asin_brand_name"),
("asin_page_inventory", "asin_page_inventory"),
("bsr_cate_1_id", "bsr_cate_1_id"),
("bsr_cate_current_id", "bsr_cate_current_id"),
("asin_buy_box_seller_type", "asin_buy_box_seller_type"),
("asin_img_url", "asin_img_url"),
("updated_time", "asin_crawl_date"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dim_fd_asin_info"
sources = [
{
"form": "ods_seller_account_syn",
"mappings": [
("id", "fd_account_id"),
("account_name", "fd_account_name"),
]
},
{
"form": "ods_seller_asin_account",
"mappings": [
("account_name", "fd_account_name"),
("asin", "asin"),
]
},
{
"form": "ods_seller_account_feedback",
"mappings": [
("country_name", "fd_country_name"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
from utils.datahub_util import build_column_lineages
to_tb = "dim_nsr_asin_rank_history"
sources = [
{
"form": "ods_new_releases_top100_asin",
"mappings": [
("asin", "asin"),
("cate_current_id", "old_category_id"),
("category_id", "category_id"),
("bsr_rank", "bsr_rank"),
("rating", "asin_rating"),
("total_comments", "asin_total_comments"),
("updated_at", "updated_at"),
("date_info", "date_info"),
("site_name", "site_name"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
from utils.datahub_util import build_column_lineages
to_tb = "dwd_bsr_asin_rank"
sources = [
{
"form": "dim_bsr_asin_rank_history",
"mappings": [
("asin", "asin"),
("category_id", "category_id"),
("bsr_rank", "bsr_rank"),
# ("", "is_1_day_flag"),
# ("", "is_7_day_flag"),
# ("", "is_30_day_flag"),
# ("", "bsr_count"),
# ("", "is_asin_new"),
# ("", "is_asin_bsr_new"),
# ("", "last_bsr_day"),
("site_name", "site_name"),
("date_info", "date_info"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
from utils.datahub_util import build_column_lineages
to_tb = "dwd_nsr_asin_rank"
sources = [
{
"form": "dim_nsr_asin_rank_history",
"mappings": [
("asin", "asin"),
("category_id", "category_id"),
("bsr_rank", "bsr_rank"),
# ("", "is_1_day_flag"),
# ("", "is_7_day_flag"),
# ("", "is_30_day_flag"),
# ("", "bsr_count"),
# ("", "is_asin_new"),
# ("", "is_asin_bsr_new"),
# ("", "last_bsr_day"),
("site_name", "site_name"),
("date_info", "date_info"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dws_st_brand_info"
sources = [
{
"form": "dim_st_detail",
"mappings": [
("search_term", "search_term"),
]
},
{
"form": "dim_brand_info",
"mappings": [
("brand_name", "first_match_brand"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dwt_aba_last365"
sources = [
{
"form": "dwt_aba_st_analytics",
"mappings": [
("id", "id"),
("search_term", "search_term"),
("category_id", "category_id"),
("st_num", "st_num1"),
("st_num", "st_num2"),
("st_num", "st_num3"),
("st_num", "st_num4"),
("st_num", "st_num5"),
("st_num", "st_num6"),
("st_num", "st_num7"),
("st_num", "st_num8"),
("st_num", "st_num9"),
("st_num", "st_num10"),
("st_num", "st_num11"),
("st_num", "st_num12"),
("form", "total_st_num"),
("bsr_orders", "bsr_orders1"),
("bsr_orders", "bsr_orders2"),
("bsr_orders", "bsr_orders3"),
("bsr_orders", "bsr_orders4"),
("bsr_orders", "bsr_orders5"),
("bsr_orders", "bsr_orders6"),
("bsr_orders", "bsr_orders7"),
("bsr_orders", "bsr_orders8"),
("bsr_orders", "bsr_orders9"),
("bsr_orders", "bsr_orders10"),
("bsr_orders", "bsr_orders11"),
("bsr_orders", "bsr_orders12"),
("market_cycle_type", "market_cycle_type1"),
("market_cycle_type", "market_cycle_type2"),
("market_cycle_type", "market_cycle_type3"),
("market_cycle_type", "market_cycle_type4"),
("market_cycle_type", "market_cycle_type5"),
("market_cycle_type", "market_cycle_type6"),
("market_cycle_type", "market_cycle_type7"),
("market_cycle_type", "market_cycle_type8"),
("market_cycle_type", "market_cycle_type9"),
("market_cycle_type", "market_cycle_type10"),
("market_cycle_type", "market_cycle_type11"),
("market_cycle_type", "market_cycle_type12"),
("search_volume", "search_volume1"),
("search_volume", "search_volume2"),
("search_volume", "search_volume3"),
("search_volume", "search_volume4"),
("search_volume", "search_volume5"),
("search_volume", "search_volume6"),
("search_volume", "search_volume7"),
("search_volume", "search_volume8"),
("search_volume", "search_volume9"),
("search_volume", "search_volume10"),
("search_volume", "search_volume11"),
("search_volume", "search_volume12"),
("st_ao_avg", "st_ao_avg"),
("st_ao_val_rate", "st_ao_val_rate"),
("supply_demand", "supply_demand"),
("price_avg", "price_avg"),
("total_comments_avg", "total_comments_avg"),
("rating_avg", "rating_avg"),
("weight_avg", "weight_avg"),
("volume_avg", "volume_avg"),
("aadd_proportion", "aadd_proportion"),
("sp_proportion", "sp_proportion"),
("fbm_proportion", "fbm_proportion"),
("cn_proportion", "cn_proportion"),
("amzon_proportion", "amzon_proportion"),
("top3_seller_orders", "top3_seller_orders"),
("top3_seller_bsr_orders", "top3_seller_bsr_orders"),
("top3_brand_orders", "top3_brand_orders"),
("top3_brand_bsr_orders", "top3_brand_bsr_orders"),
("page3_brand_num", "page3_brand_num"),
("page3_seller_num", "page3_seller_num"),
("max_num", "max_num"),
("most_proportion", "most_avg_proportion"),
("new_asin_proportion", "new_asin_num_avg_monopoly"),
("new_bsr_orders_proportion", "new_asin_bsr_orders_avg_monopoly"),
("total_asin_num", "total_asin_num"),
# ("form", "orders"),
("bsr_orders", "bsr_orders"),
("max_num_asin", "max_num_asin"),
("is_self_max_num_asin", "is_self_max_num_asin"),
("gross_profit_fee_sea", "gross_profit_fee_sea"),
("gross_profit_fee_air", "gross_profit_fee_air"),
("category_current_id", "category_current_id"),
("color_proportion", "color_proportion"),
("brand_monopoly", "brand_monopoly"),
("seller_monopoly", "seller_monopoly"),
# ("form", "top_rank"),
# ("form", "orders1"),
# ("form", "orders2"),
# ("form", "orders3"),
# ("form", "orders4"),
# ("form", "orders5"),
# ("form", "orders6"),
# ("form", "orders7"),
# ("form", "orders8"),
# ("form", "orders9"),
# ("form", "orders10"),
# ("form", "orders11"),
# ("form", "orders12"),
# ("form", "max_orders_month"),
# ("form", "max_bsr_orders_month"),
("multi_color_proportion", "multi_color_avg_proportion"),
("multi_size_proportion", "multi_size_avg_proportion"),
# ("form", "q1_bsr_orders"),
# ("form", "q2_bsr_orders"),
# ("form", "q3_bsr_orders"),
# ("form", "q4_bsr_orders"),
# ("form", "q1_orders"),
# ("form", "q2_orders"),
# ("form", "q3_orders"),
# ("form", "q4_orders"),
# ("form", "is_new_market_segment"),
# ("form", "is_first_text"),
# ("form", "is_ascending_text"),
# ("form", "is_search_text"),
("st_word_num", "st_word_num"),
# ("form", "site_name"),
# ("form", "date_type"),
# ("form", "date_info"),
]
},
{
"form": "dwt_st_sv_last365",
"mappings": [
("sv_rank", "rank"),
]
},
{
"form": "dim_st_detail",
"mappings": [
("st_search_sum", "orders"),
]
}
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
import json
from utils.datahub_util import build_column_lineages
to_tb = "dwt_nsr_asin_detail"
sources = [
{
"form": "dwd_nsr_asin_rank",
"mappings": [
("asin", "asin"),
]
},
{
"form": "dim_fd_asin_info",
"mappings": [
("fd_account_name", "account_name"),
]
},
{
"form": "dim_cal_asin_history_detail",
"mappings": [
("asin_title", "title"),
("asin_img_url", "img_url"),
("asin_img_type", "img_type"),
("asin_rating", "rating"),
("asin_total_comments", "total_comments"),
("asin_price", "price"),
("asin_weight", "weight"),
("asin_launch_time", "launch_time"),
("asin_volume", "volume"),
("asin_brand_name", "brand_name"),
("asin_buy_box_seller_type", "buy_box_seller_type"),
("asin_crawl_date", "last_update_time")
]
},
{
"form": "dwt_flow_asin",
"mappings": [
("asin_bsr_orders_change", "bsr_orders_change"),
("asin_ao_val", "ao_val"),
("bsr_orders", "bsr_orders"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
\ No newline at end of file
import json
from utils.datahub_util import build_column_lineages
to_tb = "dwt_bsr_asin_detail"
sources = [
{
"form": "dwd_bsr_asin_rank",
"mappings": [
("asin", "asin"),
]
},
{
"form": "dim_fd_asin_info",
"mappings": [
("fd_account_name", "account_name"),
]
},
{
"form": "dim_cal_asin_history_detail",
"mappings": [
("asin_title", "title"),
("asin_img_url", "img_url"),
("asin_img_type", "img_type"),
("asin_rating", "rating"),
("asin_total_comments", "total_comments"),
("asin_price", "price"),
("asin_weight", "weight"),
("asin_launch_time", "launch_time"),
("asin_volume", "volume"),
("asin_brand_name", "brand_name"),
("asin_buy_box_seller_type", "buy_box_seller_type"),
("asin_crawl_date", "last_update_time")
]
},
{
"form": "dwt_flow_asin",
"mappings": [
("asin_bsr_orders_change", "bsr_orders_change"),
("asin_ao_val", "ao_val"),
("bsr_orders", "bsr_orders"),
]
},
]
if __name__ == '__main__':
config = []
for source in sources:
for mapping in source["mappings"]:
config.append(
{
"from": f"{source['form']}.{mapping[0]}",
"to": f"{to_tb}.{mapping[1]}",
},
)
pass
url = build_column_lineages(config)
print(url)
pass
pass
#! /bin/env bash
# author: 方星钧(ffman)
# describe: dwt_aba_4_week --> <site>_last_4_week_aba_copy1
# params: $1 --> site_name
# version: 1.0
# create_date: 2022-5-20
# update_date: 2022-5-20
# sqoop executable
sqoop=/opt/module/sqoop/bin/sqoop
# variables passed in to the script
# choose the database name based on site_name
if [ $1 == all ];
then
site_name_array=(us uk de es fr it)
else
site_name_array=($1)
fi
echo "site_name_array: ${site_name_array}"
export_data () {
$sqoop export -D mapred.job.queue.name=spark \
--connect "jdbc:mysql://rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com:3306/${db}?useUnicode=true&characterEncoding=utf-8" \
--username adv_yswg \
--password HmRCMUjt03M33Lze \
--table ${mysql_table} \
--input-fields-terminated-by '\001' \
--hcatalog-database big_data_selection \
--hcatalog-table ${hive_table} \
--hcatalog-partition-keys site_name \
--hcatalog-partition-values ${site_name} \
--input-null-string '\\N' \
--input-null-non-string '\\N' \
--columns ${cols} \
--num-mappers 3
}
for site_name in ${site_name_array[*]}
do
if [ $site_name == us ];
then
db=selection
else
db=selection_$site_name
fi
echo "db: ${db}"
mysql_table=${site_name}_last_4_week_aba_copy1
hive_table=dwt_aba_4_week
cols="search_term,st_brand_id,st_rank,st_quantity_being_sold,st_search_num,st_is_first_text,st_ao_val,st_asin_orders_sum,st_asin1,st_click_share1,st_conversion_share1,st_asin2,st_click_share2,st_conversion_share2,st_asin3,st_click_share3,st_conversion_share3,week,year"
echo "当前导出的hive表:${hive_table}, mysql表: ${mysql_table}"
export_data
done
import os
import sys
import pandas as pd
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.utils import Utils
class ExportDimStYearWeek(Utils):
def __init__(self, site_name='us', year=None, week=None):
"""
Export the <site>_st_year_week table for the given year/week.
"""
super(ExportDimStYearWeek, self).__init__()
self.site_name = site_name
self.connection(db_type="mysql", db_conn="mysql_aliyun")  # mysql connection
self.year = year
self.week = week
self.year_week = f"{self.year}-{self.week}"
self.path_sh = f"/opt/module/spark/demo/py_demo/demo_sqoop/export_dim_st_year_week.sh"
print(f"self.site_name: {self.site_name}, self.year_week: {self.year_week}")
def export_data(self):
print("开始导出数据")
print("1. 建表")
with self.engine.begin() as conn:
sql_drop = f"drop table if exists {self.site_name}_st_year_week_copy1;"
sql_create = f"""
CREATE TABLE `{self.site_name}_st_year_week_copy1` (
`id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'id',
`search_term` varchar(500) NOT NULL COMMENT '搜索词',
`year_week` varchar(10) DEFAULT NULL COMMENT '年-周',
`created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
KEY `seq_year_week` (`year_week`) USING BTREE,
KEY `seq_search_term` (`search_term`,`year_week`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;
""".replace("`", "")
conn.execute(sql_drop)
conn.execute(sql_create)
print("2. 导出")
os.system(f"{self.path_sh} {self.site_name} {self.year} {self.week}")
def check_data(self):
num = 0
while True:
num += 1
print("开始检查数据")
sql_read = f"SELECT max(year_week) as max_year_week FROM {self.site_name}_st_year_week_copy1;"
df = pd.read_sql(sql_read, con=self.engine)
max_year_week = list(df.max_year_week)[0] if list(df.max_year_week) else ''
print(self.year_week, max_year_week)
if self.year_week <= max_year_week:
print(f"正常导出站点:{self.site_name}的数据")
print("3. 删除正式表,将copy表改成正式表")
with self.engine.begin() as conn:
sql_drop = f"drop table if exists {self.site_name}_st_year_week;"
sql_rename = f"ALTER TABLE {self.site_name}_st_year_week_copy1 RENAME {self.site_name}_st_year_week;"
conn.execute(sql_drop)
conn.execute(sql_rename)
break
else:
if num > 3:
print("检查次数超过3次异常,停止")
break
else:
print(f"第{num}次检查异常,继续导出检查")
self.export_data()
continue
def run(self):
self.export_data()
self.check_data()
if __name__ == '__main__':
site_name = sys.argv[1]  # arg 1: site_name list --> all: all sites
year = sys.argv[2]  # arg 2: year
week = sys.argv[3]  # arg 3: week
handle_obj = ExportDimStYearWeek(site_name=site_name, year=year, week=week)
handle_obj.run()
\ No newline at end of file
#! /bin/env bash
# author: ffman
# user_no: wg3491
# describe: dim_st_year_week --> <site>_st_year_week_copy1
# params: $1-->site_name_array; $2-->year; $3-->week
# version: 1.0
# create_date: 2022-5-26
# update_date: 2022-5-26
# sqoop executable
sqoop=/opt/module/sqoop/bin/sqoop
# variables passed in to the script
year=$2
week=$3
dt=${year}-${week}
# choose the database name based on site_name
if [ $1 == all ];
then
site_name_array=(us uk de es fr it)
else
site_name_array=($1)
fi
echo "site_name_array: ${site_name_array}"
export_data () {
$sqoop export -D mapred.job.queue.name=spark \
--connect "jdbc:mysql://rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com:3306/${db}?useUnicode=true&characterEncoding=utf-8" \
--username adv_yswg \
--password HCL1zcUgQesaaXNLbL37O5KhpSAy0c \
--table ${mysql_table} \
--input-fields-terminated-by '\001' \
--hcatalog-database big_data_selection \
--hcatalog-table ${hive_table} \
--hcatalog-partition-keys site_name \
--hcatalog-partition-values ${site_name} \
--input-null-string '\\N' \
--input-null-non-string '\\N' \
--columns ${cols} \
--num-mappers 20
}
for site_name in ${site_name_array[*]}
do
if [ $site_name == us ];
then
db=selection
else
db=selection_$site_name
fi
echo "db: ${db}"
mysql_table=${site_name}_st_year_week_copy1
hive_table=dim_st_year_week
cols="search_term,year_week"
echo "当前导出的hive表:${hive_table}, mysql表: ${mysql_table}"
export_data
done
import os
import sys
import pandas as pd
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates_mysql import TemplatesMysql
class ExportDwtStInfo(TemplatesMysql):
def __init__(self, site_name='us', date_type="week", date_info='2022-1'):
"""
默认导入所有站点的所有data_type类型的表
"""
super().__init__()
self.site_name = site_name
if self.site_name == "all":
self.site_name_list = ["us", "uk", "de", "es", "fr", "it"]
else:
self.site_name_list = [self.site_name]
self.date_type = date_type
self.date_info = date_info
if self.date_type in ["week", "4_week"]:
self.year_week = date_info
self.year = int(self.year_week.split("-")[0])
self.week = int(self.year_week.split("-")[-1])
self.params_sql = f" and week={self.week}"
self.table_save = f"{self.site_name}_brand_st_info_{self.date_type}"
self.table_update = f"{self.site_name}_brand_analytics_{self.year}"
else:
if self.date_type in ["month"]:
self.year_month = date_info
self.year = int(self.year_month.split("-")[0])
self.month = int(self.year_month.split("-")[-1])
self.params_sql = f" and month={self.month}"
else:
self.year_quarter = date_info
self.year = int(self.year_quarter.split("-")[0])
self.quarter = int(self.year_quarter.split("-")[-1])
self.params_sql = f" and quarter={self.quarter}"
self.table_save = f"{self.site_name}_brand_st_info_{self.date_type}"
self.table_update = f"{self.site_name}_brand_analytics_{self.date_type}"
self.path_sh = f"/opt/module/spark/demo/sqoop_script/dwd/export_dwt_st_info.sh"
self.df_table_counts = pd.DataFrame()
def export_data(self, site_name):
with self.engine.begin() as conn:
sql_delete = f"delete from {self.table_save} where year={self.year} {self.params_sql}"
conn.execute(sql_delete)
print(f"开始导出{site_name}站点的数据")
os.system(f"{self.path_sh} {site_name} {self.date_type} {self.date_info.split('-')[0]} {self.date_info.split('-')[1]}")
# os.system(f"{self.path_sh} {site_name} {self.date_type} {self.year} {self.week}")
def check_data(self, site_name):
self.site_name = site_name
self.engine = self.mysql_connect()
sql_read = f"select count(*) as table_counts from {self.table_save} where year={self.year} {self.params_sql}"
self.df_table_counts = pd.read_sql(sql_read, con=self.engine)
table_counts = list(self.df_table_counts.table_counts)[0]
print("table_counts:", table_counts)
if table_counts == 0:
self.export_data(site_name=site_name)
def update_data(self):
with self.engine.begin() as conn:
# conn.execute(f"set @week={self.week};")
print(f"1. {self.site_name}--更新ao_val等9大指标") # , a.orders=b.st_search_sum 暂时舍弃,需要重新计算
if self.date_type in ["week", "4_week"]:
params = f"b.week={self.week} and b.year={self.year}"
elif self.date_type in ["month"]:
params = f"b.month={self.month} and b.year={self.year}"
else:
params = f"b.quarter={self.quarter} and b.year={self.year}"
sql_update_1 = f"""UPDATE {self.table_update} a, {self.table_save} b
set a.ao_val=b.st_ao_val, a.ao_val_rank=b.st_ao_val_rank, a.ao_val_rate=b.st_ao_val_rate, a.is_ascending_text=b.st_is_ascending_text, a.is_search_text=b.st_is_search_text,
a.quantity_being_sold=b.st_quantity_being_sold,
a.bsr_orders=b.st_asin_bs_orders_sum, a.category_id=b.st_asin_bs_cate_current_id
WHERE {params} and a.id=b.st_brand_id;"""
print(f"sql_update_1:", sql_update_1)
conn.execute(sql_update_1)
print(f"2. {self.site_name}--更新is_first_text")
sql_update_2 = f"""UPDATE {self.table_update} a, {self.table_save} b
set a.is_first_text=b.st_is_first_text WHERE a.rank<=700000 and {params} and b.st_is_first_text=1 and a.id=b.st_brand_id;"""
print(f"sql_update_2:", sql_update_2)
conn.execute(sql_update_2)
print(f"3. 更新工作流表workflow_exhibition的data_type=7(ao_val计算)的状态为3")
sql_update_3 = f"update selection.workflow_exhibition set status=3 " \
f"where week='{self.year}_{self.week}' and site_name='{self.site_name}' and data_type=7"
conn.execute(sql_update_3)
def run(self):
for site_name in self.site_name_list:
self.site_name = site_name
if self.date_type in ["week", "4_week"]:
self.year_week = date_info
self.year = int(self.year_week.split("-")[0])
self.week = int(self.year_week.split("-")[-1])
self.params_sql = f" and week={self.week}"
self.table_save = f"{self.site_name}_brand_st_info_{self.date_type}"
# if self.site_name == 'us':
# self.table_save = f"{self.site_name}_brand_st_info_{self.date_type}_copy1"
self.table_update = f"{self.site_name}_brand_analytics_{self.year}"
else:
if self.date_type in ["month"]:
self.year_month = date_info
self.year = int(self.year_month.split("-")[0])
self.month = int(self.year_month.split("-")[-1])
else:
self.year_quarter = date_info
self.year = int(self.year_quarter.split("-")[0])
self.quarter = int(self.year_quarter.split("-")[-1])
self.table_save = f"{self.site_name}_brand_st_info_{self.date_type}"
self.table_update = f"{self.site_name}_brand_analytics_{self.date_type}"
self.engine = self.mysql_connect()
self.export_data(site_name=site_name)
self.check_data(site_name=site_name)
self.update_data()
if __name__ == '__main__':
site_name = sys.argv[1]  # arg 1: site_name list --> all: all sites
date_type = sys.argv[2]  # arg 2: type: week/4_week/month/quarter
date_info = sys.argv[3]  # arg 3: year-week, e.g. 2022-1
# handle_obj = ExportDwtStInfo(site_name_flag=site_name_flag, year_week=year_week)
handle_obj = ExportDwtStInfo(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
\ No newline at end of file
import os
import sys
import pandas as pd
from templates_mysql import TemplatesMysql
class ExportStInfo(TemplatesMysql):
def __init__(self, site_name_flag='all', year_week='2022-1'):
"""
默认导入所有站点的所有data_type类型的表
"""
super().__init__()
if site_name_flag == 'all':
self.site_name_list = ['us', 'uk', 'de', 'es', 'fr', 'it']
else:
self.site_name_list = [site_name_flag]
self.year_week = year_week
self.year = int(self.year_week.split("-")[0])
self.week = int(self.year_week.split("-")[-1])
self.path_sh = f"/opt/module/spark/demo/py_demo/demo_sqoop/export_st_info.sh"
self.df_table_counts = pd.DataFrame()
def export_data(self, site_name):
with self.engine.begin() as conn:
sql_delete = f"delete from {self.site_name}_brand_st_info where week={self.week}"
conn.execute(sql_delete)
print(f"开始导出{site_name}站点的数据")
os.system(f"{self.path_sh} {site_name} {self.year} {self.week}")
def check_data(self, site_name):
self.site_name = site_name
self.engine = self.mysql_connect()
sql_read = f"select count(*) as table_counts from {self.site_name}_brand_st_info where week={self.week}"
self.df_table_counts = pd.read_sql(sql_read, con=self.engine)
table_counts = list(self.df_table_counts.table_counts)[0]
print("table_counts:", table_counts)
if table_counts == 0:
self.export_data(site_name=site_name)
def update_data(self):
with self.engine.begin() as conn:
conn.execute(f"set @week={self.week};")
print(f"1. {self.site_name}--更新ao_val")
sql_update_1 = f"""UPDATE {self.site_name}_brand_analytics_{self.year} a, {self.site_name}_brand_st_info b
set a.ao_val=b.st_ao_val WHERE b.week={self.week} and b.st_ao_val>0 and a.id=b.st_brand_id;"""
conn.execute(sql_update_1)
print(f"2. {self.site_name}--更新is_first_text")
sql_update_2 = f"""UPDATE {self.site_name}_brand_analytics_{self.year} a, {self.site_name}_brand_st_info b
set a.is_first_text=b.st_is_first_text WHERE a.rank<=700000 and b.week={self.week} and b.st_is_first_text=1 and a.id=b.st_brand_id;"""
conn.execute(sql_update_2)
def run(self):
for site_name in self.site_name_list:
# set the current site and rebuild the engine before exporting, so the delete in
# export_data targets the right database and table
self.site_name = site_name
self.engine = self.mysql_connect()
self.export_data(site_name=site_name)
self.check_data(site_name=site_name)
self.update_data()
if __name__ == '__main__':
site_name_flag = sys.argv[1]  # arg 1: site_name list --> all: all sites
year_week = sys.argv[2]  # arg 2: year-week, e.g. 2022-1
handle_obj = ExportStInfo(site_name_flag=site_name_flag,
year_week=year_week)
handle_obj.run()
\ No newline at end of file
#! /bin/env bash
# author: ffman
# user_no: wg3491
# describe: dwd_st_info --> <site>_brand_st_info
# params: $1-->site_name_array; $2-->year; $3-->week
# version: 2.0
# create_date: 2022-3-22
# update_date: 2022-5-18
# sqoop executable
sqoop=/opt/module/sqoop/bin/sqoop
# variables passed in to the script
year=$2
week=$3
dt=${year}-${week}
# choose the database name based on site_name
if [ $1 == all ];
then
site_name_array=(us uk de es fr it)
else
site_name_array=($1)
fi
echo "site_name_array: ${site_name_array}"
export_data () {
$sqoop export -D mapred.job.queue.name=spark \
--connect "jdbc:mysql://rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com:3306/${db}?useUnicode=true&characterEncoding=utf-8" \
--username adv_yswg \
--password HmRCMUjt03M33Lze \
--table ${mysql_table} \
--input-fields-terminated-by '\001' \
--hcatalog-database big_data_selection \
--hcatalog-table ${hive_table} \
--hcatalog-partition-keys site_name,dt \
--hcatalog-partition-values ${site_name},${dt} \
--input-null-string '\\N' \
--input-null-non-string '\\N' \
--columns ${cols} \
--num-mappers 3
}
for site_name in ${site_name_array[*]}
do
if [ $site_name == us ];
then
db=selection
else
db=selection_$site_name
fi
echo "db: ${db}"
mysql_table=${site_name}_brand_st_info
hive_table=dwd_st_info
cols="st_brand_id,st_is_first_text,st_ao_val,week,year"
echo "当前导出的hive表:${hive_table}, mysql表: ${mysql_table}"
export_data
done
import os
import sys
from hdfs.client import Client
class ImportStDataType(object):
def __init__(self, site_name_flag='all', data_type_flag='all', year_week='2022-1'):
"""
默认导入所有站点的所有data_type类型的表
"""
self.client = Client("http://hadoop1:50070/")
if site_name_flag == 'all':
self.site_name_list = ['us', 'uk', 'de', 'es', 'fr', 'it']
else:
self.site_name_list = [site_name_flag]
if data_type_flag == 'all':
self.data_type_list = ['zr', 'sp', 'sb', 'ac', 'bs', 'er', 'tr']
else:
self.data_type_list = [data_type_flag]
self.year_week = year_week
self.year = int(self.year_week.split("-")[0])
self.week = int(self.year_week.split("-")[-1])
self.path_default = "/home/ffman/ods"
def run(self):
for site_name in self.site_name_list:
for data_type in self.data_type_list:
hdfs_path = f"{self.path_default}/ods_search_term_rank_{data_type}/site_name={site_name}/dt={self.year_week}"
for num in range(1, 4):
if num >= 2:
print(f"第{num}次重复导入")
try:
file_list = self.client.list(hdfs_path=hdfs_path, status=True)
print(f"{site_name} {data_type} {self.year} {self.week}: file_list--{len(file_list)}")
if len(file_list) == 0:
os.system(
f"/opt/module/spark/demo/py_demo/demo_sqoop/import_st_data_type.sh {site_name} {data_type} {self.year} {self.week}")
continue
else:
break
except Exception as e:
print(f"{site_name}, {data_type} isn't exists: {hdfs_path}", e)
os.system(
f"/opt/module/spark/demo/py_demo/demo_sqoop/import_st_data_type.sh {site_name} {data_type} {self.year} {self.week}")
continue
if __name__ == '__main__':
site_name_flag = sys.argv[1]  # arg 1: site_name list --> all: all sites
data_type_flag = sys.argv[2]  # arg 2: data_type list --> all: all types
year_week = sys.argv[3]  # arg 3: year-week, e.g. 2022-1
handle_obj = ImportStDataType(site_name_flag=site_name_flag,
data_type_flag=data_type_flag,
year_week=year_week)
handle_obj.run()
#! /bin/env bash
# author: ffman
# usage: sync the zr, sp, sb, ac, bs, er and tr tables for each site
# params: $1-->site_name_array; $2-->data_type_array; $3-->year; $4-->week
# version: 3.0
# create_date:2022-3-7
# update_date:2022-5-18
if [ $1 == all ];
then
site_name_array=(us uk de es fr it)
else
site_name_array=($1)
fi
if [ $2 == all ];
then
data_type_array=(zr sp sb ac bs er tr)
else
data_type_array=($2)
fi
year=$3
week=$4
dt=${year}-${week}
echo "site_name_array: ${site_name_array}, year: ${year}, week: ${week}"
import_data(){
/opt/module/sqoop-1.4.6/bin/sqoop import -D mapred.job.queue.name=default -D mapred.task.timeout=0 --append \
--connect jdbc:mysql://rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com:3306/${db} \
--username adv_yswg \
--password HmRCMUjt03M33Lze \
--target-dir ${hdfs_path} \
--query "select * from ${mysql_table} where 1=1 and \$CONDITIONS" \
--fields-terminated-by '\t' \
--compress \
--compression-codec lzop \
--m 1
}
for site_name in ${site_name_array[*]}
do
echo 1. 当前连接的mysql数据库站点: ${site_name}
if [ $site_name == us ];
then
db=selection
else
db=selection_$site_name
fi
echo "db: ${db}"
for data_type in ${data_type_array[*]}
do
echo 2. 先删除已经存在的分区: ${site_name}, ${data_type}
hdfs_path=/home/ffman/ods/ods_search_term_rank_${data_type}/site_name=${site_name}/dt=${dt}
hdfs dfs -rm -r ${hdfs_path}/*
echo 3. 导入数据
mysql_table=${site_name}_search_term_rank_${data_type}_${year}_${week}
echo "mysql_table: ${mysql_table}"
import_data
echo 4. 建立lzo索引
hadoop jar \
/opt/module/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar \
com.hadoop.compression.lzo.DistributedLzoIndexer \
${hdfs_path}
echo 5. 恢复外部表的元数据
hive -e "MSCK REPAIR TABLE big_data_selection.ods_search_term_rank_${data_type}"
done
done
import time
import traceback
from sqlalchemy import create_engine
class TemplatesMysql(object):
def __init__(self, site_name='us'):
self.DB_CONN_DICT = {
"mysql_port": "3306",
"mysql_db": "selection",
"mysql_user": "adv_yswg",
# "mysql_pwd": "S4FeR09bFF441lTz",
"mysql_pwd": "Ty11ky169s120wxO15qz",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
self.site_name = site_name
self.engine = self.mysql_connect()
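# mysql_connect retries until an engine is created, backing off linearly (20s, 40s, ...);
# the database name is derived from site_name ('selection' for us, 'selection_<site>' otherwise).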
def mysql_connect(self):
"""
Connection of mysql.
"""
nums = 0
while True:
nums += 1
try:
if self.site_name == 'us':
db = 'selection'
else:
db = f'selection_{self.site_name}'
return create_engine(
f'mysql+pymysql://{self.DB_CONN_DICT["mysql_user"]}:' + f'{self.DB_CONN_DICT["mysql_pwd"]}@{self.DB_CONN_DICT["mysql_host"]}:{self.DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4') # , pool_recycle=3600
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
def mysql_reconnect(self, table_name=None, e=None):
"""
Repeated connection of mysql.
"""
print(f"{table_name}表报错,等待5s继续:", e, f"\n{traceback.format_exc()}")
self.engine = self.mysql_connect()
time.sleep(5)
import os
import sys
import re
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# import shared udf helpers
from yswg_utils.common_udf import udf_parse_bs_category
# from ..yswg_utils.common_udf import udf_parse_bs_category
from utils.spark_util import SparkUtil
from utils.hdfs_utils import HdfsUtils
class DimAsinAmordersInfo(Templates):
def __init__(self, site_name='us', date_type="month", date_info='2022-1'):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
# initialize the self.spark object
self.db_save = 'dim_asin_amorders_info'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
self.get_year_week_tuple()
self.df_save = self.spark.sql("select 1+1;")
self.df_asin_amazon_orders = self.spark.sql("select 1+1;")
self.df_asin_detail = self.spark.sql("select 1+1;")
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.reset_partitions(partitions_num=10)
self.u_parse_amazon_orders = self.spark.udf.register('u_parse_amazon_orders', self.udf_parse_amazon_orders, IntegerType())
self.hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dim/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
def read_data(self):
# us month, month_week, 4_week, week
# uk/de month, 4_week, week
if self.site_name in ['us', 'uk', 'de']:
if self.date_type in ['month', 'month_week']:
if (self.site_name == 'us') or (self.site_name in ['uk', 'de'] and self.date_info >= '2024-05'):
params = f"date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
params = f"date_type='week' and date_info in {self.year_week_tuple}"
else:
params = f"date_type='week' and date_info in {self.year_week_tuple}"
else:
params = f"date_type='week' and date_info in {self.year_week_tuple}"
sql = f"select asin, buy_data as asin_amazon_orders_str, created_time, 2 as asin_amazon_orders_label " \
f"from ods_other_search_term_data where site_name='{self.site_name}' and {params} and buy_data is not null;" # and date_info>='2023-15'
print(f"1. 读取ods_other_search_term_data表数据: sql -- {sql}")
self.df_asin_amazon_orders = self.spark.sql(sqlQuery=sql).cache()
self.df_asin_amazon_orders.show(10, truncate=False)
sql = f"select asin, buy_sales as asin_amazon_orders_str, created_at as created_time, 1 as asin_amazon_orders_label " \
f"from ods_asin_detail where site_name='{self.site_name}' and {params} and buy_sales is not null;" # and date_info>='2023-15'
print(f"1. 读取df_asin_detail表数据: sql -- {sql}")
self.df_asin_detail = self.spark.sql(sqlQuery=sql).cache()
self.df_asin_detail.show(10, truncate=False)
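# udf_parse_amazon_orders turns strings such as "2k+" (the "bought in past month" value)
# into an integer estimate, e.g. "2k+" -> 2000; anything not matching exactly once becomes None.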
@staticmethod
def udf_parse_amazon_orders(asin_amazon_orders_str):
"""
解析asin详情页面的月销字段
"""
pattern = "(\d+[k]{0,})\+"
results_list = re.findall(pattern, str(asin_amazon_orders_str).lower())
if len(results_list) == 1:
result = int(results_list[0].replace("k", "000").replace(" ", ""))
else:
result = None
return result
def handle_data(self):
# handle the asin monthly-sales data from the search-term pages
window = Window.partitionBy(['asin']).orderBy(
self.df_asin_amazon_orders.asin_amazon_orders_str.desc_nulls_last(),
self.df_asin_amazon_orders.created_time.desc_nulls_last(),
)
self.df_asin_amazon_orders = self.df_asin_amazon_orders.withColumn("rk", F.row_number().over(window=window))
self.df_asin_amazon_orders = self.df_asin_amazon_orders.filter("rk=1").drop("rk").cache()
self.df_asin_amazon_orders.show(10, truncate=False)
# union the detail-page and search-term-page records
self.df_save = self.df_asin_detail.unionByName(self.df_asin_amazon_orders, allowMissingColumns=True)
# handle the asin monthly-sales data from the asin detail page
window = Window.partitionBy(['asin']).orderBy(
self.df_save.asin_amazon_orders_str.desc_nulls_last(),
self.df_save.created_time.desc_nulls_last(),
)
self.df_save = self.df_save.withColumn("rk", F.row_number().over(window=window))
self.df_save = self.df_save.filter("rk=1").drop("rk").cache()
# the window function can still leave duplicates
# self.df_save = self.df_save.dropDuplicates(['asin'])
# parse the amazon monthly-sales field
self.df_save = self.df_save.withColumn('asin_amazon_orders', self.u_parse_amazon_orders('asin_amazon_orders_str'))
self.df_save = self.df_save.withColumn("site_name", F.lit(self.site_name))
self.df_save = self.df_save.withColumn("date_type", F.lit(self.date_type))
self.df_save = self.df_save.withColumn("date_info", F.lit(self.date_info))
self.df_save.show(10, truncate=False)
self.df_save.filter("asin_amazon_orders is not null").show(10, truncate=False)
print(f"清除hdfs目录中.....{self.hdfs_path}")
HdfsUtils.delete_file_in_folder(self.hdfs_path)
print(f"当前存储的表名为:{self.db_save},分区为{self.partitions_by}")
# self.df_save.write.saveAsTable(name=self.db_save, format='hive', mode='append', partitionBy=self.partitions_by)
# print("success")
# quit()
if __name__ == '__main__':
site_name = sys.argv[1]  # arg 1: site
date_type = sys.argv[2]  # arg 2: type: week/4_week/month/quarter
date_info = sys.argv[3]  # arg 3: year-week/year-month/year-quarter, e.g. 2022-1
handle_obj = DimAsinAmordersInfo(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
import os
import re
import sys
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
# window functions for group-wise ranking
from pyspark.sql.window import Window
from pyspark.sql import functions as F
class DimAsinBsCategoryDesc(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.db_save = f'dim_asin_bs_category_desc'
self.spark = self.create_spark_object(app_name=f"{self.db_save}: {self.site_name}")
self.df_bs_detail = self.spark.sql(f"select 1+1;")
# partition parameters
self.partitions_by = ['site_name']
self.partitions_num = 100
schema = StructType([
StructField('bs_rank_str', StringType(), True),
StructField('bs_category_str', StringType(), True),
])
self.u_rank_and_category = self.spark.udf.register("u_rank_and_category", self.udf_rank_and_category, schema)
@staticmethod
def udf_rank_and_category(best_sellers_rank):
pattern = r"#([\d,]+) in ([\w&' ]+)"
matches = re.findall(pattern, best_sellers_rank)
bs_rank_str = ",".join([rank.replace(",", "") for rank, category in matches])
bs_category_str = ",".join([category.strip().replace(",", " ") for rank, category in matches])
return bs_rank_str, bs_category_str
def read_data(self):
sql = f"select asin, best_sellers_rank, last_herf, site_name, date_info from ods_bs_category_asin_detail where site_name='{self.site_name}' and date_type='week'"
print("sql:", sql)
self.df_bs_detail = self.spark.sql(sql).cache()
self.df_bs_detail.show(10, truncate=False)
def handle_data(self):
# dedupe, keeping the latest record
window = Window.partitionBy(['asin']).orderBy(F.desc("date_info"))
self.df_bs_detail = self.df_bs_detail.withColumn(f"row_number", F.row_number().over(window)) \
.filter(f'row_number = 1')
# extract rank and category
self.df_bs_detail = self.df_bs_detail.withColumn(
'bs_str', self.u_rank_and_category('best_sellers_rank')
)
self.df_bs_detail = self.df_bs_detail.withColumn('bs_rank_str', self.df_bs_detail.bs_str.getField('bs_rank_str')) \
.withColumn('bs_category_str', self.df_bs_detail.bs_str.getField('bs_category_str')) \
.drop('bs_str')
self.df_bs_detail.show(10, truncate=False)
self.df_save = self.df_bs_detail
if __name__ == '__main__':
site_name = sys.argv[1]  # arg 1: site
handle_obj = DimAsinBsCategoryDesc(site_name=site_name)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil, DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql.window import Window
from pyspark.sql import functions as F
class DimAsinBuyData(object):
def __init__(self, site_name):
self.site_name = site_name
app_name = f"{self.__class__.__name__}:{site_name}"
self.spark = SparkUtil.get_spark_session(app_name)
self.hive_table = "dim_asin_buy_data"
self.hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dim/{self.hive_table}/site_name={self.site_name}"
self.partitions_num = CommonUtil.reset_partitions(site_name, 10)
self.df_asin_buy_data = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def run(self):
# read ods_other_search_term_data
sql = f"""
select
asin,buy_data,updated_time
from ods_other_search_term_data
where site_name = '{self.site_name}'
and date_type = 'month';
"""
print(sql)
self.df_asin_buy_data = self.spark.sql(sqlQuery=sql).cache()
# dedupe, keeping the latest record for each asin
window = Window.partitionBy(self.df_asin_buy_data.asin).orderBy(
self.df_asin_buy_data.updated_time.desc_nulls_last()
)
self.df_asin_buy_data = self.df_asin_buy_data.withColumn("rk", F.row_number().over(window=window))
self.df_asin_buy_data = self.df_asin_buy_data.filter("rk=1")
# fill in the remaining columns
self.df_save = self.df_asin_buy_data.select(
F.col('asin'),
F.col('buy_data'),
F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:ss').alias('created_time'),
F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:ss').alias('updated_time'),
F.lit(self.site_name).alias("site_name")
)
self.df_save = self.df_save.repartition(self.partitions_num)
partition_by = ["site_name"]
print(f"清除hdfs目录中.....{self.hdfs_path}")
HdfsUtils.delete_file_in_folder(self.hdfs_path)
print(f"当前存储的表名为:{self.hive_table},分区为{partition_by}")
self.df_save.write.saveAsTable(name=self.hive_table, format='hive', mode='append', partitionBy=partition_by)
print("success")
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
obj = DimAsinBuyData(site_name)
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
"""
asin下架实时表
"""
class DimAsinErrState(object):
def __init__(self, site_name):
self.site_name = site_name
app_name = f"{self.__class__.__name__}:{site_name}"
self.spark = SparkUtil.get_spark_session(app_name)
self.hive_tb = "dim_asin_err_state"
def run(self):
now_date = CommonUtil.format_now("%Y-%m-%d")
day_30_before = CommonUtil.get_day_offset(now_date, -30)
# asins delisted within the last 30 days
sql = f"""
select asin,
date_info as asin_unlaunch_time
from dwd_day_asin
where site_name = '{self.site_name}'
and date_info >= '{day_30_before}'
and craw_state = 2
"""
df_day = self.spark.sql(sql).cache()
sql_all = f"""
select asin, date_format(updated_at, 'yyyy-MM-dd') as asin_unlaunch_time
from ods_asin_err_state
where site_name = '{self.site_name}'
"""
df_save = self.spark.sql(sql_all)
df_save = df_save.unionByName(df_day).groupby("asin").agg(
F.max(F.col("asin_unlaunch_time")).alias("asin_unlaunch_time"),
F.lit(self.site_name).alias("site_name")
)
partition_dict = {
"site_name": self.site_name,
}
# repartition into 2 partitions
df_save = df_save.repartition(2)
CommonUtil.save_or_update_table(
spark_session=self.spark,
hive_tb_name=self.hive_tb,
partition_dict=partition_dict,
df_save=df_save
)
print("success")
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
obj = DimAsinErrState(site_name)
obj.run()
"""
1. 上架日期
2. 分类id
"""
"""
author: 方星钧(ffman)
description: 基于ods_asin_detail历史表,计算出asin的历史数据指标(上架时间+bs分类id)
table_read_name: ods_asin_detail, , selection_off_line.dwd_bs_category_asin
table_save_name: dim_asin_history_info
table_save_level: dwd
version: 1.0
created_date: 2022-06-20
updated_date: 2022-06-20
"""
import os
import sys
from pyspark.storagelevel import StorageLevel
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
# from AmazonSpider.pyspark_job.utils.templates import Templates
# window functions for group-wise ranking
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType
class DimAsinHistoryInfo(Templates):
def __init__(self, site_name="us", date_type="week", date_info="2022-1"):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.db_save = f"dim_asin_history_info"
self.spark = self.create_spark_object(app_name=f"{self.db_save} {self.site_name}, {self.date_info}")
self.df_date = self.get_year_week_tuple()
self.df_asin_detail = self.spark.sql(f"select 1+1;")
self.df_bs_category = self.spark.sql(f"select 1+1;")
self.df_bs_category_report = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.year_month = list(self.df_date.loc[self.df_date.year_week == f'{self.year_week}'].year_month)[0]
# self.reset_partitions_by()
self.partitions_by = ['site_name']
self.reset_partitions(20)
self.partitions_type = "dt"
self.u_year_week = self.spark.udf.register('u_year_week', self.udf_year_week, StringType())
@staticmethod
def udf_year_week(dt):
year, week = dt.split("-")[0], dt.split("-")[1]
if int(week) < 10:
return f"{year}-0{week}"
else:
return f"{year}-{week}"
def read_data(self):
print("1.1 读取ods_asin_detail和ods_keep_date表")
sql = f"select asin, launch_time, site_name, dt, 1 as type, rank, price, rating, total_comments from ods_asin_detail " \
f"where site_name='{self.site_name}' " \
f"union " \
f"select asin, launch_time, site_name, dt, 2 as type, null as rank, null as price, null as rating, null as total_comments from ods_keep_date " \
f"where site_name='{self.site_name}';"
self.df_asin_detail = self.spark.sql(sqlQuery=sql).cache()
self.df_asin_detail.show(10, truncate=False)
print("1.2 读取dwd_bs_category_asin表")
sql = f"select asin, cate_1_id, cate_current_id, dt from selection_off_line.dwd_bs_category_asin " \
f"where site='{self.site_name}' and dt !='9999-99';" # '9999-99'分区是去重不符合
self.df_bs_category = self.spark.sql(sqlQuery=sql).cache()
self.df_bs_category.show(10, truncate=False)
print("1.3 读取ods_one_category_report表")
sql = f"select cate_1_id, rank, orders from ods_one_category_report " \
f"where site_name='{self.site_name}' and dm='{self.year_month}';"
print("sql:", sql)
self.df_bs_category_report = self.spark.sql(sqlQuery=sql).cache()
self.df_bs_category_report.show(10, truncate=False)
def handle_data(self):
self.handle_data_duplicated()
self.handle_data_join()
self.handle_data_renamed()
self.df_save.show(10, truncate=False)
def handle_data_duplicated(self):
print("2.1 根据asin,dt去重")
self.df_asin_detail = self.df_asin_detail.withColumn(
"dt_sort", self.u_year_week(self.df_asin_detail.dt)
)
self.df_bs_category = self.df_bs_category.withColumn(
"dt_sort", self.u_year_week(self.df_bs_category.dt)
)
window = Window.partitionBy(['asin']).orderBy(
self.df_asin_detail.type.asc_nulls_last(),
self.df_asin_detail.dt_sort.desc()
)
self.df_asin_detail = self.df_asin_detail.withColumn("asin_dt_top", F.row_number().over(window=window))
self.df_asin_detail = self.df_asin_detail.filter("asin_dt_top=1")
# merge the launch_time data from keep_date
# this approach hangs, so it is commented out
# asin_list = self.df_asin_detail.rdd.map(lambda x: x[0]).collect()
# self.df_keep_date = self.df_keep_date.filter(~self.df_keep_date.asin.isin(asin_list))
# self.df_asin_detail = self.df_asin_detail.unionByName(self.df_keep_date)
window = Window.partitionBy(['asin']).orderBy(
self.df_bs_category.dt_sort.desc()
)
self.df_bs_category = self.df_bs_category.withColumn("asin_dt_top", F.row_number().over(window=window))
self.df_bs_category = self.df_bs_category.filter("asin_dt_top=1")
self.df_asin_detail.groupby('dt_sort').count().show(30, truncate=False)
self.df_bs_category.groupby('dt_sort').count().show(30, truncate=False)
self.df_asin_detail = self.df_asin_detail.drop("asin_dt_top", "type", "dt_sort")
self.df_bs_category = self.df_bs_category.drop("asin_dt_top", "dt", "dt_sort")
def handle_data_join(self):
self.df_save = self.df_asin_detail.join(
self.df_bs_category, on=['asin'], how='left'
).join(
self.df_bs_category_report, on=['rank', 'cate_1_id'], how='left'
)
# self.df_save = self.df_save.withColumn(f"{self.partitions_type}", F.lit(self.date_info))
# quit()
def handle_data_renamed(self):
self.df_save = self.df_save.\
withColumnRenamed("launch_time", "asin_launch_time").\
withColumnRenamed("cate_1_id", "asin_bs_cate_1_id").\
withColumnRenamed("cate_current_id", "asin_bs_cate_current_id").\
withColumnRenamed("rank", "asin_rank").\
withColumnRenamed("price", "asin_price"). \
withColumnRenamed("rating", "asin_rating"). \
withColumnRenamed("total_comments", "asin_total_comments").\
withColumnRenamed("orders", "asin_bs_orders")
if __name__ == '__main__':
site_name = sys.argv[1]  # arg 1: site
# date_type = sys.argv[2]  # type: week/4_week/month/quarter
date_info = sys.argv[2]  # arg 2: year-week/year-month/year-quarter, e.g. 2022-1
# handle_obj = DwdAsinHistoryInfo(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj = DimAsinHistoryInfo(site_name=site_name, date_info=date_info)
handle_obj.run()
\ No newline at end of file
import os
import sys
import re
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
class DimAsinImageTmp(Templates):
def __init__(self, site_name='us', date_type="month", date_info='2022-1'):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
# Initialize the self.spark object
self.db_save = 'dim_asin_image_info'
self.spark = self.create_spark_object(app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
self.u_asin_to_number = F.udf(self.udf_asin_to_number, IntegerType())
self.df_save = self.spark.sql("select 1+1;")
# self.partitions_by = ['site_name', 'date_type', 'date_info']
self.partitions_by = ['site_name']
self.partitions_dict = {
"site_name": site_name
}
self.reset_partitions(partitions_num=100)
@staticmethod
# Convert an asin into a number so rows can be assigned to a fixed partition
def udf_asin_to_number(asin):
"""
Convert a 10-character ASIN string to a unique number.
This function assumes that ASIN consists of uppercase letters and digits.
"""
def char_to_number(char):
if char.isdigit():
return int(char)
else:
return ord(char) - 55 # 'A' -> 10, 'B' -> 11, ..., 'Z' -> 35
if len(asin) != 10:
raise ValueError("ASIN must be 10 characters long")
base = 36
asin_number = 0
for i, char in enumerate(reversed(asin)):
asin_number += char_to_number(char) * (base ** i)
# Take the result modulo 1 billion so it stays in the range 0-999,999,999
return asin_number % 1000000000
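# A minimal illustration of the mapping above (hypothetical ASINs, not real catalogue entries):
# udf_asin_to_number("0000000001") -> 1 ('1' at base-36 position 0)
# udf_asin_to_number("000000000A") -> 10 ('A' maps to 10)
# udf_asin_to_number("0000000010") -> 36 ('1' at position 1 contributes 1 * 36)
# Because of the modulo, distinct ASINs can collide, so the value is only suitable for partition routing, not as a key.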
def read_data(self):
sql = f"select * from ods_asin_image where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info in ('0000-00', '{self.date_info}');"
print(f"sql: {sql}")
self.df_save = self.spark.sql(sql).cache()
self.df_save.show(10)
self.df_save = self.df_save.withColumn("mapped_asin", self.u_asin_to_number("asin"))
# self.df_save.show(10)
def handle_data(self):
window = Window.partitionBy(['asin', 'data_type', 'img_order_by']).orderBy(
self.df_save.created_at.desc(),
)
self.df_save = self.df_save.withColumn(
"row_number", F.row_number().over(window=window)
)
self.df_save = self.df_save.filter("row_number=1").drop("row_number")
# self.df_save = self.df_save.withColumn("date_type", F.lit(self.date_type))
# self.df_save = self.df_save.withColumn("date_info", F.lit(self.date_info))
hdfs_path = CommonUtil.build_hdfs_path(self.db_save, partition_dict=self.partitions_dict)
print(f"当前存储的表名为:{self.db_save},分区为{self.partitions_dict}")
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_file_in_folder(hdfs_path)
self.df_save.show(10)
if __name__ == '__main__':
site_name = sys.argv[1] # arg 1: site
date_type = sys.argv[2] # arg 2: date type: week/4_week/month/quarter
date_info = sys.argv[3] # arg 3: year-week/year-month/year-quarter, e.g. 2022-1
handle_obj = DimAsinImageTmp(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
\ No newline at end of file
import os
import sys
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.window import Window
class DimAsinImgInfo(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.db_save = f'dim_asin_img_info'
self.spark = self.create_spark_object(app_name=f"{self.db_save}: {self.site_name}")
self.df_asin_img = self.spark.sql(f"select 1+1;")
self.df_asin_truncate = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.partitions_by = ['site_name']
self.reset_partitions(100)
def read_data(self):
sql = f"select asin, asin_img_url, asin_img_path, asin_trun_1, asin_trun_2, asin_trun_3, asin_trun_4, asin_trun_5, " \
f"asin_trun_6, asin_trun_7, asin_trun_8, asin_trun_9, date_info_img_url as date_info, site_name " \
f"from dim_asin_stable_info where site_name='{self.site_name}';"
print("sql:", sql)
self.df_asin_img = self.spark.sql(sql).cache()
self.df_asin_img.show(10, truncate=False)
# print(111, self.df_asin_img.count())
self.df_asin_img = self.df_asin_img.filter("asin_img_url is not null")
# print(222, self.df_asin_img.count())
sql = f"select asin, asin_img_url, 3 as state " \
f"from dim_cal_asin_truncate where site_name='{self.site_name}' and asin_img_url is not null;"
print("sql:", sql)
self.df_asin_truncate = self.spark.sql(sql).cache()
self.df_asin_truncate.show(10, truncate=False)
print(self.df_asin_truncate.count())
def handle_data(self):
self.df_save = self.df_asin_img.join(
self.df_asin_truncate, on=['asin', 'asin_img_url'], how='left'
)
self.df_save = self.df_save.fillna({'state': 1})
# self.df_save.show(10, truncate=False)
# print(self.df_save.count())
# quit()
if __name__ == '__main__':
site_name = sys.argv[1] # arg 1: site
handle_obj = DimAsinImgInfo(site_name=site_name)
handle_obj.run()
\ No newline at end of file
import os
import sys
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.window import Window
class DimAsinImgInfo(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.db_save = f'dim_asin_img_info'
self.spark = self.create_spark_object(app_name=f"{self.db_save}: {self.site_name}")
self.df_asin_img = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.partitions_by = ['site_name']
self.reset_partitions(100)
def read_data(self):
sql = f"select asin, img_url as asin_img_url, date_info " \
f"from ods_asin_detail where site_name='{self.site_name}' and date_type='week';"
print("sql:", sql)
self.df_asin_img = self.spark.sql(sql).cache()
self.df_asin_img.show(10, truncate=False)
def handle_data(self):
self.df_asin_img = self.df_asin_img.filter("asin_img_url is not null")
self.df_asin_img = self.df_asin_img.filter(self.df_asin_img.asin_img_url.contains('amazon')) # keep only records whose image url contains 'amazon'
window = Window.partitionBy('asin').orderBy(F.desc('date_info')) # partition by asin and sort by date_info descending
self.df_asin_img = self.df_asin_img.withColumn('row_number', F.row_number().over(window)) # number the rows within each partition
self.df_asin_img = self.df_asin_img.filter(self.df_asin_img.row_number == 1).drop('row_number') # keep only the latest row per asin and drop the helper column
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_1", F.substring(self.df_asin_img.asin, 1, 1))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_2", F.substring(self.df_asin_img.asin, 1, 2))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_3", F.substring(self.df_asin_img.asin, 1, 3))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_4", F.substring(self.df_asin_img.asin, 1, 4))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_5", F.substring(self.df_asin_img.asin, 1, 5))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_6", F.substring(self.df_asin_img.asin, 1, 6))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_7", F.substring(self.df_asin_img.asin, 1, 7))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_8", F.substring(self.df_asin_img.asin, 1, 8))
self.df_asin_img = self.df_asin_img.withColumn("asin_trun_9", F.substring(self.df_asin_img.asin, 1, 9))
self.df_asin_img = self.df_asin_img.withColumn(
"asin_img_path",
F.concat(
F.lit("/"), self.df_asin_img.asin_trun_1,
F.lit("/"), self.df_asin_img.asin_trun_2,
F.lit("/"), self.df_asin_img.asin_trun_3,
F.lit("/"), self.df_asin_img.asin_trun_4,
F.lit("/"), self.df_asin_img.asin_trun_5,
F.lit("/"), self.df_asin_img.asin_trun_6,
F.lit("/")
)
)
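# Illustration with a hypothetical asin "B01ABCDEFG": the prefixes are "B", "B0", "B01", "B01A",
# "B01AB", "B01ABC", so asin_img_path becomes "/B/B0/B01/B01A/B01AB/B01ABC/".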
self.df_asin_img = self.df_asin_img.withColumn("state", F.lit(1))
self.df_asin_img = self.df_asin_img.withColumn("site_name", F.lit(self.site_name))
self.df_save = self.df_asin_img
self.df_save.show(10, truncate=False)
if __name__ == '__main__':
site_name = sys.argv[1] # arg 1: site
handle_obj = DimAsinImgInfo(site_name=site_name)
handle_obj.run()
\ No newline at end of file
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.db_util import DBUtil
from pyspark.sql import functions as F
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
"""
Archive the local storage paths of asin images and apply the related preprocessing.
"""
class DimAsinImgPath(object):
def __init__(self, site_name):
self.site_name = site_name
app_name = f"{self.__class__.__name__}:{site_name}"
self.spark = SparkUtil.get_spark_session(app_name)
self.hive_tb = "dim_asin_img_path"
def run(self):
sql = f"""
select id,
asin,
asin_img_url,
asin_img_path,
created_at,
bsr_cate_current_id
from ods_asin_img_path
where site_name = '{self.site_name}'
and asin_img_url is not null
and asin_img_url != 'null'
and asin_img_path is not null
"""
print("======================查询sql如下======================")
print(sql)
df_save = self.spark.sql(sql)
if df_save.first() is None:
print("============================No data, skipping===================================")
return
path_sql = f"""
select id as bsr_cate_current_id,
category_id
from us_bs_category
"""
conn_info = DBUtil.get_connection_info("mysql", "us")
id_df = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=path_sql
)
# todo
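# Note: join() without an explicit `how` defaults to an inner join, so asins whose
# bsr_cate_current_id has no match in us_bs_category are dropped from df_save here.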
df_save = df_save.join(id_df, on='bsr_cate_current_id').select(
F.col("id"),
F.col("asin"),
F.col("asin_img_url"),
F.col("asin_img_path"),
F.col("created_at"),
df_save["bsr_cate_current_id"],
F.col("category_id"),
F.lit(self.site_name).alias("site_name"),
)
partition_dict = {
"site_name": self.site_name
}
hdfs_path = CommonUtil.build_hdfs_path(self.hive_tb, partition_dict=partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
partition_by = list(partition_dict.keys())
print(f"当前存储的表名为:{self.hive_tb},分区为{partition_by}", )
df_save.write.saveAsTable(name=self.hive_tb, format='hive', mode='append', partitionBy=partition_by)
print("success")
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
obj = DimAsinImgPath(site_name)
obj.run()
"""
@Author : HuangJian
@Description : asin brand labels within the given time period (asins crawled from the first 3 pages of search-term results)
@SourceTable :
①ods_other_search_term_data
@SinkTable : dim_asin_label
@CreateTime : 2023/05/04 15:20
@UpdateTime : 2023/05/04 15:20
"""
import os
import sys
import re
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil, DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from yswg_utils.common_udf import udf_handle_string_null_value as NullUDF
from functools import reduce
class DimAsinLabel(object):
def __init__(self, site_name, date_type, date_info):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
self.hive_table = "dim_asin_label"
# Get the tuple of week partitions that make up this weekly pipeline run
self.complete_date_info_tuple = CommonUtil.transform_week_tuple(self.spark, self.date_type, self.date_info)
self.hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dim/{self.hive_table}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
self.partitions_num = CommonUtil.reset_partitions(site_name, 1)
self.df_date = object() # placeholder for the dataframe object to be saved
self.date_sql = self.date_sql_padding()
# Initialize the global dataframes
self.df_asin_label = self.spark.sql(f"select 1+1;")
self.handle_string_num_value = F.udf(NullUDF, StringType())
def date_sql_padding(self):
if 'us' == self.site_name:
if self.date_type == DateTypes.month_week.name:
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == DateTypes.month.name and self.date_info >= '2023-10':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
date_sql = f"and date_type='week' and date_info in {self.complete_date_info_tuple}"
elif self.site_name in ['uk', 'de']:
if self.date_type == DateTypes.month.name and self.date_info >= '2024-05':
date_sql = f"and date_type='{self.date_type}' and date_info='{self.date_info}'"
elif self.date_type == DateTypes.month_week.name and self.date_info >= '2024-06':
date_sql = f"and date_type='{self.date_type}' and date_info='{self.date_info}'"
else:
date_sql = f"and date_type='week' and date_info in {self.complete_date_info_tuple}"
print(date_sql)
return date_sql
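# Examples of the generated predicate (illustrative inputs, not taken from a real run):
# site_name='us', date_type='month', date_info='2023-11' -> " and date_type='month' and date_info = '2023-11'"
# site_name='us', date_type='week', date_info='2023-18' -> "and date_type='week' and date_info in <complete_date_info_tuple>"
# For a site_name outside ('us', 'uk', 'de') no branch assigns date_sql, which would raise UnboundLocalError.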
def run(self):
print("======================查询sql如下======================")
# 读取ods_other_search_term_data
sql = f"""
select
asin, label
from
(select
asin,
lower(label) as label,
created_time,
row_number() over(partition by asin, label order by created_time desc) as crank
from ods_other_search_term_data
where site_name = '{self.site_name}' {self.date_sql}
and trim(label) not in ('null', '')
) t
where t.crank = 1
"""
print(sql)
self.df_asin_label = self.spark.sql(sqlQuery=sql).cache()
# Aggregate the labels of each asin (multiple labels per asin are collected into a set)
self.df_asin_label = self.df_asin_label.groupby(["asin"]).agg(
F.collect_set("label").alias("asin_label_list")
)
movie_label_list = ['prime video', 'dvd', 'blu-ray', 'kindle', 'app', 'paperback', 'audible audiobook',
'kindle edition', 'kindle & comixology', 'hardcover', 'comic', 'multi-format', '4k',
'library binding', 'vinyl', 'audio cd', 'mp3 music', 'single issue magazine',
'print magazine', 'unknown binding']
condition = reduce(
lambda acc, keyword: acc | F.expr(f"exists(asin_label_list, x -> x like '%{keyword}%')"),
movie_label_list,
F.lit(False)
)
self.df_asin_label = self.df_asin_label.withColumn("asin_label_type", condition.cast("int"))
# Default asin_label_type to 0 for asins that could not be labelled
self.df_asin_label = self.df_asin_label.na.fill({"asin_label_type": 0})
# Fill in the partition fields
df_save = self.df_asin_label.select(
F.col('asin'),
self.handle_string_num_value('asin_label_list').alias('asin_label_list'),
F.col('asin_label_type'),
F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:ss').alias('created_time'),
F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:ss').alias('updated_time'),
F.lit(self.site_name).alias("site_name"),
F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info")
)
df_save = df_save.repartition(self.partitions_num)
partition_by = ["site_name", "date_type", "date_info"]
print(f"清除hdfs目录中.....{self.hdfs_path}")
HdfsUtils.delete_file_in_folder(self.hdfs_path)
print(f"当前存储的表名为:{self.hive_table},分区为{partition_by}")
df_save.write.saveAsTable(name=self.hive_table, format='hive', mode='append', partitionBy=partition_by)
print("success")
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
obj = DimAsinLabel(site_name, date_type, date_info)
obj.run()
import os
import sys
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql.types import StringType
# window functions for group-wise ranking
from pyspark.sql.window import Window
from pyspark.sql import functions as F
class DimAsinTitleInfo(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.db_save_vertical = f'dim_asin_title_info_vertical'
self.db_save_wide = f'dim_asin_title_info_wide'
self.spark = self.create_spark_object(app_name=f"{self.db_save_vertical}: {self.site_name}, {self.date_type}, {self.date_info}")
self.df_theme = self.spark.sql(f"select 1+1;")
self.df_asin_title = self.spark.sql(f"select 1+1;")
self.df_save_vertical = self.spark.sql(f"select 1+1;") # vertical (long) table
self.df_save_wide = self.spark.sql(f"select 1+1;") # wide table
# Register the user-defined function (UDF)
self.u_theme_pattern = F.udf(self.udf_theme_pattern, StringType())
# Other variables
# self.pattern = str() # regex pattern
self.theme_list_str = str() # theme list string used for matching
# Partition parameters
self.partitions_by = ['site_name']
self.partitions_num = 100
@staticmethod
def udf_theme_pattern(title, theme_list_str):
found_themes = [theme.strip() for theme in eval(theme_list_str) if theme in title]
if found_themes:
return ','.join(set(found_themes))
else:
return None
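# Example with hypothetical values: theme_list_str = "[' halloween ', ' christmas ']" and the padded
# title " halloween pumpkin string lights " contains " halloween " but not " christmas ", so the UDF
# returns "halloween"; a title containing none of the themes returns None.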
def read_data(self):
sql = f"select id as theme_id, theme_type_en, theme_en, theme_en_lower, theme_ch from ods_theme where site_name='{self.site_name}'"
print("sql:", sql)
self.df_theme = self.spark.sql(sql).cache()
self.df_theme.show(10, truncate=False)
# sql = f"-- select asin, title as asin_volume, date_info from ods_asin_detail where site_name='{self.site_name}' and date_type='week'" # and date_info>='2023-15'
sql = f"select asin, title as asin_title, date_info, site_name from ods_asin_detail where site_name='{self.site_name}' and date_type='week' " # and date_info>='2023-25' limit 10000
print("sql:", sql)
self.df_asin_title = self.spark.sql(sqlQuery=sql).cache()
self.df_asin_title.show(10, truncate=False)
def handle_data(self):
self.handle_filter_dirty_data()
self.handle_theme()
def handle_filter_dirty_data(self):
"""
Filter out dirty data and keep the latest title per asin
"""
# Lowercase the title
self.df_asin_title = self.df_asin_title.withColumn("asin_title_lower", F.lower(self.df_asin_title["asin_title"]))
# Filter out null values
self.df_asin_title = self.df_asin_title.filter("asin_title_lower is not null")
# Filter out the literal strings 'null', 'none' and 'nan'
self.df_asin_title = self.df_asin_title.filter("asin_title_lower not in ('none', 'null', 'nan')")
# Keep the title from the latest date_info
window = Window.partitionBy('asin').orderBy(F.desc('date_info')) # partition by asin and sort by date_info descending
self.df_asin_title = self.df_asin_title.withColumn('row_number', F.row_number().over(window)) # number the rows within each partition with a window function
self.df_asin_title = self.df_asin_title.filter(self.df_asin_title.row_number == 1).drop('row_number') # keep only the latest row per asin and drop the helper column
def handle_theme(self):
pdf_theme = self.df_theme.toPandas()
theme_list = list(set(pdf_theme.theme_en_lower))
self.theme_list_str = str([f" {theme} " for theme in theme_list])
print("self.theme_list_str:", self.theme_list_str)
# Kept for building the wide table later
df_asin_title = self.df_asin_title.cache() # reused later to join back the original asin_title
self.df_asin_title = self.df_asin_title.withColumn("asin_title_lower", F.concat(F.lit(" "), "asin_title_lower", F.lit(" "))) # pad the title with spaces so only whole words are matched
self.df_asin_title = self.df_asin_title.withColumn("theme_en_lower", self.u_theme_pattern('asin_title_lower', F.lit(self.theme_list_str)))
# Split the comma-separated themes into an array
self.df_asin_title = self.df_asin_title.withColumn("theme_en_lower", F.split(self.df_asin_title["theme_en_lower"], ","))
# Explode the array into one row per theme
self.df_asin_title = self.df_asin_title.withColumn("theme_en_lower", F.explode(self.df_asin_title["theme_en_lower"]))
self.df_asin_title = self.df_asin_title.join(
self.df_theme, on=['theme_en_lower'], how='left' # consider switching to inner to avoid imprecise matches
)
# 1. Vertical (long) table
self.df_save_vertical = self.df_asin_title.cache()
print(self.df_save_vertical.columns)
self.df_save_vertical.show(30, truncate=False)
# self.df_save_vertical.filter("theme_en_lower is not null").show(30, truncate=False)
# 2. Wide table
self.df_asin_title = self.df_asin_title.drop_duplicates(['asin', 'theme_type_en', 'theme_ch'])
self.df_asin_title = self.df_asin_title.withColumn("theme_type_en_counts", F.concat("theme_type_en", F.lit("_counts")))
self.df_asin_title = self.df_asin_title.withColumn("theme_type_en_ids", F.concat("theme_type_en", F.lit("_ids")))
# self.df_asin_title.filter('theme_type_en_counts is null').show(20, truncate=False) # no records
self.df_asin_title = self.df_asin_title.filter('theme_type_en_counts is not null')
pivot_df1 = self.df_asin_title.groupBy("asin").pivot("theme_type_en_counts").agg(
F.expr("IFNULL(count(*), 0) AS value"))
pivot_df1 = pivot_df1.na.fill(0)
pivot_df2 = self.df_asin_title.groupBy("asin").pivot("theme_type_en_ids").agg(
F.concat_ws(",", F.collect_list("theme_id")))
pivot_df1.show(30, truncate=False)
pivot_df2.show(30, truncate=False)
self.df_save_wide = df_asin_title.join(
pivot_df1, on='asin', how='left'
).join(
pivot_df2, on='asin', how='left'
)
# self.df_save_wide.show(30, truncate=False)
print(self.df_save_wide.columns)
def save_data(self):
self.reset_partitions(partitions_num=100)
self.save_data_common(
df_save=self.df_save_vertical,
db_save=self.db_save_vertical,
partitions_num=self.partitions_num,
partitions_by=self.partitions_by
)
self.reset_partitions(partitions_num=100)
self.save_data_common(
df_save=self.df_save_wide,
db_save=self.db_save_wide,
partitions_num=self.partitions_num,
partitions_by=self.partitions_by
)
if __name__ == '__main__':
site_name = sys.argv[1] # arg 1: site
handle_obj = DimAsinTitleInfo(site_name=site_name)
handle_obj.run()
\ No newline at end of file
import os
import re
import sys
import time
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from pyspark.storagelevel import StorageLevel
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql.types import StringType, IntegerType
# window functions for group-wise ranking
from pyspark.sql.window import Window
from pyspark.sql import functions as F
class DimAsinTitleInfo(Templates):
def __init__(self, site_name='us', date_type="month", date_info='2022-1'):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.db_save_vertical = f'dim_asin_title_info_vertical'
self.db_save_wide = f'dim_asin_title_info_wide'
self.spark = self.create_spark_object(app_name=f"{self.db_save_vertical}: {self.site_name}, {self.date_type}, {self.date_info}")
self.df_theme = self.spark.sql(f"select 1+1;")
self.df_asin_title = self.spark.sql(f"select 1+1;")
self.df_save_vertical = self.spark.sql(f"select 1+1;") # vertical (long) table
self.df_save_wide = self.spark.sql(f"select 1+1;") # wide table
# Register the user-defined function (UDF)
self.u_theme_pattern = F.udf(self.udf_theme_pattern, StringType())
# Other variables
self.pattern = str() # regex pattern
# Partition parameters
self.partitions_by = ['site_name']
self.partitions_num = 100
@staticmethod
def udf_theme_pattern(title, pattern):
results_list = re.findall(pattern, title) # , re.IGNORECASE,
if results_list:
unique_first_values = set() # a set deduplicates the matches automatically
for item in results_list:
# findall returns plain strings when the pattern has a single capture group; item[0] would only take the first character
theme = (item[0] if isinstance(item, tuple) else item).strip() # strip the padding spaces around the match
unique_first_values.add(theme)
return ','.join(unique_first_values)
else:
return None
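# Example with a hypothetical pattern (assuming Python 3.7+, where re.escape leaves spaces as-is):
# pattern = "(?=( christmas tree | tree ))"
# re.findall(pattern, " big christmas tree ") -> [' christmas tree ', ' tree ']
# The zero-width lookahead lets overlapping themes ("christmas tree" and "tree") both be captured;
# a plain alternation would consume " christmas tree " and never report " tree ".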
def read_data(self):
sql = f"select id, theme_type_en, theme_en, theme_en_lower, theme_ch from ods_theme where site_name='{self.site_name}'"
print("sql:", sql)
self.df_theme = self.spark.sql(sql).cache()
self.df_theme.show(10, truncate=False)
# sql = f"-- select asin, title as asin_volume, date_info from ods_asin_detail where site_name='{self.site_name}' and date_type='week'" # and date_info>='2023-15'
sql = f"select asin, title as asin_title, date_info, site_name from ods_asin_detail where site_name='{self.site_name}' and date_type='week' and date_info>='2023-25'" # and date_info>='2023-15'
print("sql:", sql)
self.df_asin_title = self.spark.sql(sqlQuery=sql).cache()
self.df_asin_title.show(10, truncate=False)
def handle_data(self):
self.handle_filter_dirty_data()
self.handle_theme()
def handle_filter_dirty_data(self):
"""
Filter out dirty data and keep the latest title per asin
"""
# Lowercase the title
self.df_asin_title = self.df_asin_title.withColumn("asin_title_lower", F.lower(self.df_asin_title["asin_title"]))
# Filter out null values
self.df_asin_title = self.df_asin_title.filter("asin_title_lower is not null")
# Filter out the literal strings 'null', 'none' and 'nan'
self.df_asin_title = self.df_asin_title.filter("asin_title_lower not in ('none', 'null', 'nan')")
# Keep the title from the latest date_info
window = Window.partitionBy('asin').orderBy(F.desc('date_info')) # partition by asin and sort by date_info descending
self.df_asin_title = self.df_asin_title.withColumn('row_number', F.row_number().over(window)) # number the rows within each partition with a window function
self.df_asin_title = self.df_asin_title.filter(self.df_asin_title.row_number == 1).drop('row_number') # keep only the latest row per asin and drop the helper column
def handle_theme(self):
pdf_theme = self.df_theme.toPandas()
pattern_list = list(set(pdf_theme.theme_en_lower))
# pattern_list = [f' {pattern} ' for pattern in pattern_list if pattern] # drop empty strings -- speeds up matching
pattern_list = [re.escape(f' {pattern} ') for pattern in pattern_list if pattern] # drop empty strings -- speeds up matching
pattern_list.sort(key=len, reverse=True) # sort by length so longer themes are tried first
pattern_str = '|'.join(pattern_list) # alternation between themes
# pattern_str = re.escape(pattern_str)
self.pattern = '(?=(' + pattern_str + '))' # lookahead pattern so overlapping themes can all be captured
# self.pattern = re.escape(self.pattern)
print("self.pattern:", self.pattern)
df_asin_title = self.df_asin_title.cache() # reused later when building the wide table
self.df_asin_title = self.df_asin_title.withColumn("asin_title_lower", F.concat(F.lit(" "), "asin_title_lower", F.lit(" "))) # pad the title with spaces so only whole words are matched
self.df_asin_title = self.df_asin_title.withColumn("theme_en_lower", self.u_theme_pattern('asin_title_lower', F.lit(self.pattern)))
# Split the comma-separated themes into an array
self.df_asin_title = self.df_asin_title.withColumn("theme_en_lower", F.split(self.df_asin_title["theme_en_lower"], ","))
# Explode the array into one row per theme
self.df_asin_title = self.df_asin_title.withColumn("theme_en_lower", F.explode(self.df_asin_title["theme_en_lower"]))
# self.df_asin_title.show(100, truncate=False)
# self.df_asin_title.filter("asin='0060574437'").show(100, truncate=False)
# self.df_asin_title.filter('theme_en_lower is null').show(20, truncate=False) # no records
self.df_asin_title = self.df_asin_title.join(
self.df_theme, on=['theme_en_lower'], how='left' # consider switching to inner to avoid imprecise matches
)
self.df_save_vertical = self.df_asin_title
self.df_save_vertical.show(30, truncate=False)
self.df_asin_title = self.df_asin_title.drop_duplicates(['asin', 'theme_type_en', 'theme_ch'])
# self.df_asin_title.filter('theme_en_lower is null').show(20, truncate=False) # no records
# self.df_save.show(30, truncate=False)
self.df_asin_title = self.df_asin_title.withColumn("theme_type_en_counts", F.concat("theme_type_en", F.lit("_counts")))
self.df_asin_title = self.df_asin_title.withColumn("theme_type_en_ids", F.concat("theme_type_en", F.lit("_ids")))
# self.df_asin_title.filter('theme_type_en_counts is null').show(20, truncate=False) # no records
self.df_asin_title = self.df_asin_title.filter('theme_type_en_counts is not null')
pivot_df1 = self.df_asin_title.groupBy("asin").pivot("theme_type_en_counts").agg(F.expr("IFNULL(count(*), 0) AS value"))
pivot_df1 = pivot_df1.na.fill(0)
pivot_df2 = self.df_asin_title.groupBy("asin").pivot("theme_type_en_ids").agg(F.concat_ws(",", F.collect_list("id")))
pivot_df1.show(30, truncate=False)
# pivot_df2.show(30, truncate=False)
self.df_save_wide = df_asin_title.join(
pivot_df1, on='asin', how='left'
).join(
pivot_df2, on='asin', how='left'
)
# self.df_save_wide.show(30, truncate=False)
print(self.df_save_wide.columns)
def save_data(self):
self.reset_partitions(partitions_num=50)
self.save_data_common(
df_save=self.df_save_vertical,
db_save=self.db_save_vertical,
partitions_num=self.partitions_num,
partitions_by=self.partitions_by
)
self.reset_partitions(partitions_num=100)
self.save_data_common(
df_save=self.df_save_wide,
db_save=self.db_save_wide,
partitions_num=self.partitions_num,
partitions_by=self.partitions_by
)
if __name__ == '__main__':
site_name = sys.argv[1] # arg 1: site
date_type = sys.argv[2] # arg 2: date type: day/week/4_week/month/quarter
date_info = sys.argv[3] # arg 3: year-month-day/year-week/year-month/year-quarter, e.g. 2022-1
handle_obj = DimAsinTitleInfo(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
\ No newline at end of file