Commit 42d87c58 by wangjing
parents ce835ee7 dd03cbe7
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import col
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ABA_2023_10_12_export")
sql1 = """
select
date_info,
search_term,
st_bsr_cate_1_id_new as category_id,
market_cycle_type,
is_first_text,
is_ascending_text,
is_high_return_text,
is_search_text,
st_movie_label,
st_brand_label,
bsr_orders,
st_word_num,
st_num,
rank
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info in ('2023-10','2023-11','2023-12');
"""
df_dwt_aba_st_analytics = spark.sql(sql1).cache()
sql2 = """
select
category_id,
en_name
from dim_bsr_category_tree
where site_name = 'us'
and category_parent_id = 0;
"""
df_dim_bsr_category_tree = spark.sql(sql2).cache()
sql3 = """
select
search_term,
rank_change_rate,
rank_rate_of_change,
date_info
from dwt_aba_last_change_rate
where site_name = 'us'
and date_type = 'month'
and date_info in ('2023-10','2023-11','2023-12');
"""
df_dwt_aba_last_change_rate = spark.sql(sql3).cache()
    # Filter for search terms that meet the conditions (new, ascending, or market cycle 1/2)
df_dwt_aba_st_analytics = df_dwt_aba_st_analytics.filter(
"(is_first_text = 1) or (is_ascending_text = 1) or (market_cycle_type in (1, 2))"
)
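    # Enrich with top-level category names and the latest rank change-rate metrics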
df_save = df_dwt_aba_st_analytics.join(
df_dim_bsr_category_tree, on='category_id', how='left'
).join(
df_dwt_aba_last_change_rate, on=['date_info', 'search_term'], how='left'
)
df_save = df_save.select(
col('date_info').alias('year_month'),
col('search_term'),
col('en_name').alias('category'),
col('market_cycle_type'),
col('is_first_text'),
col('is_ascending_text'),
col('is_high_return_text'),
col('is_search_text'),
col('st_movie_label').alias('movie_label'),
col('st_brand_label').alias('brand_label'),
col('bsr_orders'),
col('st_word_num').alias('word_counts'),
col('st_num').alias('word_frequency'),
col('rank'),
col('rank_change_rate').alias('year_on_year'),
col('rank_rate_of_change').alias('month_on_month')
)
df_save.repartition(5).show(10, truncate=True)
df_save.write.saveAsTable(name='tmp_aba_2023_export', format='hive', mode='append')
spark.stop()
import os
import re
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, explode, split, udf, lit
from pyspark.sql.types import ArrayType, StringType
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
spark = SparkUtil.get_spark_session("ABA_2023_10_12_word_frequency")
    # UDF: split the phrase into groups of two consecutive words (bigrams)
    def split_two_by_two(search_term):
        words = search_term.split()
        pairs = []
        for i in range(len(words) - 1):
            pairs.append(words[i] + ' ' + words[i + 1])
        return pairs
    u_split_two_by_two = udf(split_two_by_two, ArrayType(StringType()))
    # UDF: split the phrase into groups of three consecutive words (trigrams)
def split_three_by_three(search_term):
words = search_term.split()
triplets = []
for i in range(len(words) - 2):
triplets.append(words[i] + ' ' + words[i + 1] + ' ' + words[i + 2])
return triplets
u_split_three_by_three = udf(split_three_by_three, ArrayType(StringType()))
    # UDF: strip extraneous characters from the search term
def characters_to_remove(search_term):
pattern = r'\s[^\w\s%\']+?\s'
cleaned_text = re.sub(pattern, ' ', search_term)
cleaned_text = cleaned_text.replace('\n', ' ')
return cleaned_text
u_characters_to_remove = udf(characters_to_remove, StringType())
sql = f"""
select
search_term
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info = '{date_info}';
"""
df_aba = spark.sql(sql).cache()
df_aba = df_aba.select(
u_characters_to_remove(df_aba['search_term']).alias('search_term')
)
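    # Unigram frequency: explode each search term into single words, count them, and keep words seen at least 50 times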
df_one_word = df_aba.select(
explode(split(df_aba['search_term'], ' ')).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-1')
)
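    # Bigram frequency: same aggregation over two-word groups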
    df_two_word = df_aba.select(
        explode(u_split_two_by_two(df_aba['search_term'])).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-2')
)
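    # Trigram frequency: same aggregation over three-word groups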
df_three_word = df_aba.select(
explode(u_split_three_by_three(df_aba['search_term'])).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-3')
)
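    # Append all three frames to the same Hive table; the date_info suffix (-1/-2/-3) records the n-gram size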
df_one_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
    df_two_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
df_three_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, explode, lit, desc, sum
from pyspark.sql.types import ArrayType, StringType
from textblob import Word
from googletrans import Translator
class ABA2023YearWordFrequency(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session("spark_task: aba_2023_year_word_frequency")
self.df_aba_2023 = self.spark.sql(f"select 1+1;")
self.df_beside_category = self.spark.sql(f"select 1+1;")
self.df_translate = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.df_save1 = self.spark.sql(f"select 1+1;")
self.df_save2 = self.spark.sql(f"select 1+1;")
self.df_agg = self.spark.sql(f"select 1+1;")
        # Custom UDF registrations
self.u_get_singular_form = self.spark.udf.register('get_singular_form', self.get_singular_form, StringType())
self.u_word_tokenize = self.spark.udf.register('word_tokenize', self.word_tokenize, ArrayType(StringType()))
# self.u_word_translate = self.spark.udf.register('word_translate', self.word_translate, StringType())
@staticmethod
def get_singular_form(word: str):
"""
        Convert every word to its singular form
"""
if word:
singular_form = Word(word).lemmatize("n")
# word_object = Word(word)
# singular_form = word_object.singularize()
return singular_form
return word
@staticmethod
def word_tokenize(title: str):
"""
        Tokenizer
"""
from nltk.tokenize import word_tokenize
result = word_tokenize(title, "english")
return result
# @staticmethod
# def word_translate(word: str):
# if word:
# try:
# translator = Translator()
# result = translator.translate(word, src='en', dest='zh-cn')
# return result.text
# except Exception as e:
    # # Handle any other unexpected error
# print(f"An unexpected error occurred: {e}")
# return None
# return None
def read_data(self):
sql1 = f"""
select
search_term,
category_id
from dwt_aba_last365
where site_name = 'us'
and date_type = 'last365day'
and date_info = '2023-12';
"""
self.df_aba_2023 = self.spark.sql(sql1).cache()
print("df_aba_2023的数量:")
print(self.df_aba_2023.count())
sql2 = f"""
select
category_id
from dim_bsr_category_tree
where site_name = 'us'
and en_name in ('Audible Books & Originals', 'Books', 'Kindle Store', 'Apps & Games', 'Movies & TV', 'CDs & Vinyl', 'Software', 'Video Games')
and category_parent_id = 0;
"""
self.df_beside_category = self.spark.sql(sql2).cache()
print("df_beside_category的数量:")
print(self.df_beside_category.count())
sql3 = f"""
select
word,
simple_cn as cn
from tmp_en_dict;
"""
self.df_translate = self.spark.sql(sql3).cache()
print("df_translate的数量:")
print(self.df_translate.count())
def handle_data(self):
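        # Exclude search terms whose top-level category is in the media exclusion list (left_anti join), then tokenize and count word frequencies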
self.df_save = self.df_aba_2023.join(
self.df_beside_category, on='category_id', how='left_anti'
).select('search_term')
self.df_save = self.df_save.select(explode(self.u_word_tokenize(self.df_save['search_term'])).alias('word'))
self.df_save = self.df_save.groupby(['word']).agg(
count('word').alias('word_frequency')
)
self.df_save = self.df_save.join(
self.df_translate, on='word', how='left'
).withColumn(
'word_singular_form',
self.u_get_singular_form(self.df_save['word'])
).cache()
self.df_save1 = self.df_save.select(
'word', 'word_frequency', 'cn'
).orderBy(
desc('word_frequency')
).withColumn(
'date_info',
lit('2023')
)
print("df_save1的数量:")
print(self.df_save1.count())
self.df_save1.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
print("df_save1存储完成!")
self.df_agg = self.df_save.groupby(['word_singular_form']).agg(
sum('word_frequency').alias('word_frequency')
)
self.df_save2 = self.df_save.select('word', 'cn', 'word_singular_form').join(
self.df_agg, on='word_singular_form', how='left'
).select(
'word', 'word_frequency', 'cn'
).orderBy(
desc('word_frequency')
).withColumn(
'date_info',
lit('2023-merge')
)
print("df_save2的数量:")
print(self.df_save2.count())
self.df_save2.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
print("df_save2存储完成!")
if __name__ == '__main__':
obj = ABA2023YearWordFrequency()
obj.read_data()
obj.handle_data()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
if __name__ == '__main__':
start_date = CommonUtil.get_sys_arg(1, None)
end_date = CommonUtil.get_sys_arg(2, None)
spark_session = SparkUtil.get_spark_session("re-run-aba-month")
sql = f"""
select distinct year_month as date_info from dim_date_20_to_30 where year_month >= '{start_date}' and year_month < '{end_date}';
"""
date_df = spark_session.sql(sql)
    date_df.show()
date_list = sorted([d.asDict().get("date_info") for d in date_df.collect()])
print(date_list)
for date_info in date_list:
startParams = {
"site_name": "us",
"date_type": "month",
"date_info": date_info
}
print(startParams)
        DolphinschedulerHelper.start_and_watch_process_instance(
            "big_data_selection",
            process_df_name='月-重跑ABA四分位',
            startParams=startParams
        )
CommonUtil.send_wx_msg(["huangjian", "chenyuanjie"], "【月-重跑ABA四分位】重跑完成", "")
pass
def asin_to_number(asin):
"""
Convert a 10-character ASIN string to a unique number.
This function assumes that ASIN consists of uppercase letters and digits.
"""
def char_to_number(char):
if char.isdigit():
return int(char)
else:
return ord(char) - 55 # 'A' -> 10, 'B' -> 11, ..., 'Z' -> 35
if len(asin) != 10:
raise ValueError("ASIN must be 10 characters long")
base = 36
asin_number = 0
for i, char in enumerate(reversed(asin)):
asin_number += char_to_number(char) * (base ** i)
    # Take the result modulo 1 billion so the value fits in the range 0 to 999,999,999
    return asin_number % 1000000000
if __name__ == '__main__':
x = asin_to_number('B0CGY4LZQ3')
print(x)
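    # Bucket the number into image shard tables of 10 million values each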
s = f'us_asin_image_part{int(x / 1000_0000) + 1}'
print(s)
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql.window import Window
from pyspark.storagelevel import StorageLevel
from pyspark.sql import functions as F
class DwtMerchantwordsStDetailMerge(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.batch = '2024-1'
self.db_save = 'dwt_merchantwords_st_detail_merge'
self.spark = self.create_spark_object(
app_name=f"DwtMerchantwordsStDetailMerge: {self.site_name}, {self.batch}")
self.partitions_num = 15
self.partitions_by = ['site_name', 'batch']
self.df = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/batch={self.batch}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
def read_data(self):
print("读取dwt_merchantwords_st_detail数据")
sql = f"""
select
keyword,
volume,
avg_3m,
avg_12m,
depth,
results_count,
sponsored_ads_count,
page_1_reviews,
appearance,
last_seen,
update_time,
lang,
batch as last_batch
from dwt_merchantwords_st_detail
where site_name = '{self.site_name}'
and batch in ('2023-1', '2024-1');
"""
self.df = self.spark.sql(sqlQuery=sql)
self.df = self.df.repartition(80).persist(StorageLevel.MEMORY_ONLY)
self.df.show(10, truncate=True)
def handle_data(self):
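        # Deduplicate by keyword: order each keyword's rows by batch (newest first) and keep only the first row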
window = Window.partitionBy('keyword').orderBy(
F.desc_nulls_last('last_batch')
)
self.df = self.df.withColumn("u_rank", F.row_number().over(window=window))
self.df = self.df.filter('u_rank=1').drop('u_rank')
self.df_save = self.df.withColumn(
'site_name',
F.lit(self.site_name)
).withColumn(
'batch',
F.lit(self.batch)
)
if __name__ == '__main__':
site_name = sys.argv[1]
handle_obj = DwtMerchantwordsStDetailMerge(site_name=site_name)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.templates import Templates
from pyspark.sql import functions as F
class FlowAsinLast30days(Templates):
def __init__(self):
super().__init__()
self.db_save = "tmp_flow_asin_last30days"
self.spark = self.create_spark_object(app_name="FlowAsinLast30days")
self.partitions_num = 20
self.partition_dict = {}
self.df_es = self.spark.sql(f"select 1+1;")
self.df_parent = self.spark.sql(f"select 1+1;")
self.df_joined = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
self.df_es = self.spark.read.format("org.elasticsearch.spark.sql")\
.option("es.nodes", "192.168.10.217")\
.option("es.port", "9200")\
.option("es.net.http.auth.user", "elastic")\
.option("es.net.http.auth.pass", "selection2021.+")\
.option("es.resource", "us_st_detail_last_4_week")\
.option("es.query", '{"query": {"match_all": {}}}')\
.load()
columns = ["asin", "first_category_rank", "asin_bought_month", "total_comments", "variation_num", "site_name", "account_name"]
self.df_es = self.df_es.select(columns).cache()
self.df_es.show()
sql = f"""
select
asin,
parent_asin
from
ods_asin_variat;
"""
self.df_parent = self.spark.sql(sqlQuery=sql).cache()
def handle_data(self):
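        # Left-join parent ASINs from Hive onto the ES data and flag whether each ASIN has a parent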
# self.df_parent = self.df_parent.groupby(["parent_asin"]).agg(F.count("asin").alias("variation_num"))
self.df_joined = self.df_es.join(self.df_parent, "asin", "left")
self.df_joined = self.df_joined\
.withColumn("parent_asin_is_null", F.when(F.col("parent_asin").isNull(), F.lit(1)).otherwise(F.lit(0)))\
.withColumn("parent_asin_exist", F.when(F.col("parent_asin").isNotNull(), F.lit(1)).otherwise(F.lit(0)))
def save_data(self):
self.df_save = self.df_joined
hdfs_path_asin_info = CommonUtil.build_hdfs_path(self.db_save, partition_dict=self.partition_dict)
print(f"清除hdfs目录中:{hdfs_path_asin_info}")
HdfsUtils.delete_file_in_folder(hdfs_path_asin_info)
print(f"当前存储的表名为:{self.db_save}")
self.df_save.write.saveAsTable(name=self.db_save, format='hive', mode='append')
print("success")
if __name__ == '__main__':
obj = FlowAsinLast30days()
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ["2024-02","2024-01","2023-12","2023-11","2023-10","2023-09"]
for date_info in date_list:
startParams = {
"site_name": "us",
"date_type": "month",
"date_info": date_info
}
print(startParams)
DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='export_dwt_flow_asin_api',
startParams=startParams
)
CommonUtil.send_wx_msg(["chenyuanjie", "wangrui4"], "【export_dwt_flow_asin_api】导出完成", "")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
from datetime import datetime
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
n = CommonUtil.get_sys_arg(2, 0)
hive_tb = "dwt_merchantwords_st_detail"
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session(f"export: {hive_tb}")
    # Export 4,000,000 rows per batch
batch_size = (int(n)-1) * 4000000
start_index = 1 + batch_size
end_index = 4000000 + batch_size
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
    # Read existing data from the PostgreSQL database
# df_pg = spark.read.format("jdbc") \
# .option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
# .option("dbtable", export_tb) \
# .option("user", "yswg_postgres") \
# .option("password", "yswg_postgres") \
# .load()
# df_pg = df_pg\
# .select("search_term") \
# .drop_duplicates(["search_term"]) \
# .repartition(70) \
# .cache()
    # Read data from the Hive table
df_hive = spark.sql(f"SELECT keyword FROM {hive_tb}")
df_hive = df_hive\
.withColumn("row_num", row_number().over(Window.orderBy("keyword")))\
.filter(f"row_num BETWEEN {start_index} AND {end_index}")\
.select("keyword")\
.repartition(10) \
.cache()
    # Drop keywords that contain Chinese characters
df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
    # Drop keywords that already exist in the target database
# df_hive = df_hive.join(df_pg, df_hive["keyword"] == df_pg["search_term"], "leftanti")
    # Exit if there is no data left to export
if df_hive.count() == 0:
print("-------数据已全部导出!-------")
quit()
df_hive = df_hive.selectExpr("keyword AS search_term")
df_hive = df_hive.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_hive = df_hive.withColumn("date_info", lit(date_info))
    # Export the data to the PostgreSQL database
df_hive.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit, length
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
n = CommonUtil.get_sys_arg(2, 0)
import_tb = "search_term_result_year"
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSRToPG16")
    # Export 4,000,000 rows per batch
batch_size = (int(n)-1) * 4000000
start_index = 1 + batch_size
end_index = 4000000 + batch_size
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
    # Read existing data from the SR (StarRocks) database
df = spark.read.format("jdbc") \
.option("url", "jdbc:mysql://192.168.10.151:19030/test") \
.option("dbtable", import_tb) \
.option("user", "chenyuanjie") \
.option("password", "chenyuanjie12345") \
.load()
df = df.withColumn(
"row_num",
row_number().over(Window.orderBy("search_term"))
).filter(f"row_num BETWEEN {start_index} AND {end_index}").repartition(20).cache()
    # Drop rows whose keyword contains Chinese characters
df = df.filter(~df["search_term"].rlike("[\u4e00-\u9fff]"))
    # Exit if there is no data left to export
if df.count() == 0:
print("-------数据已全部导出!-------")
quit()
df = df.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df = df.filter(length(df['url']) <= 450)
df = df.withColumn("date_info", lit(date_info))
    # Export the data to the PostgreSQL database
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.StarRocksHelper import StarRocksHelper
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ods_asin_detail_sr_to_hive")
partition_dict = {
"site_name": 'us',
"date_type": 'month',
"date_info": '2024-03'
}
hdfs_path = CommonUtil.build_hdfs_path('ods_asin_detail_test', partition_dict=partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
connection_info = StarRocksHelper.get_connection_info('selection')
df_sr = spark.read.format("starrocks") \
.option("starrocks.fe.http.url", f"{connection_info['ip']}:{connection_info['http_port']}") \
.option("starrocks.fe.jdbc.url", f"jdbc:mysql://{connection_info['ip']}:{connection_info['jdbc_port']}") \
.option("starrocks.table.identifier", "test.ods_asin_detail_test2") \
.option("starrocks.user", connection_info['user']) \
.option("starrocks.password", connection_info['pwd']) \
.option("starrocks.request.tablet.size", "1") \
.option("starrocks.batch.size", "40960") \
.option("starrocks.exec.mem.limit", "21474836480") \
.load()
print("读取完毕")
    df_sr = df_sr.repartition(50)  # repartition returns a new DataFrame, so reassign it
partitions_by = ['site_name', 'date_type', 'date_info']
df_sr.write.saveAsTable(name='ods_asin_detail_test', format='hive', mode='append', partitionBy=partitions_by)
spark.stop()
    # Build the LZO index and repair the Hive metadata
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb='ods_asin_detail_test')
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ods_asin_detail_to_sr_test")
sql = """
select
*
from ods_asin_detail
where site_name = 'us'
and date_type = 'month'
and date_info = '2024-03'
"""
df_hive = spark.sql(sql).repartition(40)
connection_info = StarRocksHelper.get_connection_info('selection')
df_hive.write.format("starrocks") \
.option("starrocks.fe.http.url", f"{connection_info['ip']}:{connection_info['http_port']}") \
.option("starrocks.fe.jdbc.url", f"jdbc:mysql://{connection_info['ip']}:{connection_info['jdbc_port']}") \
.option("starrocks.table.identifier", "test.ods_asin_detail_test") \
.option("starrocks.user", connection_info['user']) \
.option("starrocks.password", connection_info['pwd']) \
.option("starrocks.write.flush.interval.ms", "10000") \
.option("starrocks.write.properties.column_separator", "~!@#$%^&*~!@#$%^&*") \
.mode("append") \
.save()
print("导出完毕")
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2022-02', '2022-03', '2022-04', '2022-05', '2022-06',
'2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2022年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2023-11', '2023-12']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2023年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2024-01', '2024-02', '2024-03', '2024-04',
'2024-05', '2024-06', '2024-07', '2024-08']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2024年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
from datetime import datetime
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
year, month, day = date_info.split("-")
table = f"us_merchantwords_brand_analytics_2024_{month}_{day}"
spark = SparkUtil.get_spark_session(f"us_merchantwords_brand_analytics_2024:pg2pg,{date_info}")
df = spark.read.format("jdbc") \
.option("url", "jdbc:postgresql://113.100.143.162:5432/selection") \
.option("dbtable", table) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.load()
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://113.100.143.162:5443/selection") \
.option("dbtable", table) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
export_tb = "de_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSupplement")
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.de/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
sql1 = """
select
keyword,
volume,
st_monthly_sales,
greatest(results_count, asin_total_num) as asin_total_num,
st_sp_counts,
st_zr_counts
from dwt_merchantwords_merge
where site_name = 'de'
and batch = '2024-07-01'
"""
df_dwt_merchantwords_merge = spark.sql(sql1)
# sql2 = """
# select
# keyword
# from dwt_merchantwords_st_detail
# where site_name = 'de'
# and batch = '2024-1'
# """
# df_dwt_merchantwords_st_detail = spark.sql(sql2)
    # Keywords with more than 80 products but no monthly sales
df1 = df_dwt_merchantwords_merge.filter('asin_total_num > 80 and st_monthly_sales <= 0').select('keyword')
print("产品总数大于80且没有月销:" + str(df1.count()))
    # Keywords with noticeable search volume but no SP ad count
df2 = df_dwt_merchantwords_merge.filter('volume >= 1 and st_sp_counts <= 0').select('keyword')
print("搜索量较大且没有sp广告词:" + str(df2.count()))
    # Keywords whose organic (ZR) result count is <= 0
df3 = df_dwt_merchantwords_merge.filter('st_zr_counts <= 0').select('keyword')
print("自然词总数 <= 0的部分:" + str(df3.count()))
    # # Drop keywords that contain Chinese characters
# df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
df_save = df1.union(df2).union(df3).drop_duplicates(['keyword'])
df_save = df_save.selectExpr("keyword AS search_term")
df_save = df_save.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_save = df_save.withColumn("date_info", lit('2024-06-26'))
    # Export the data to the PostgreSQL database
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5433/selection_de") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSupplement")
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
sql1 = """
select
keyword,
volume,
st_zr_counts,
st_sp_counts
from dwt_merchantwords_merge
where site_name = 'us'
and batch = '2024-07-01'
"""
df_dwt_merchantwords_merge = spark.sql(sql1)
    # Keywords with noticeable search volume but no SP ad count
df1 = df_dwt_merchantwords_merge.filter('volume >= 1 and st_sp_counts <= 0').select('keyword')
print("搜索量较大且没有sp广告词:" + str(df1.count()))
    # Keywords whose organic (ZR) result count is <= 0
df2 = df_dwt_merchantwords_merge.filter('st_zr_counts <= 0').select('keyword')
print("自然词总数 <= 0的部分:" + str(df2.count()))
    # # Drop keywords that contain Chinese characters
# df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
df_save = df1.union(df2).drop_duplicates(['keyword'])
df_save = df_save.selectExpr("keyword AS search_term")
df_save = df_save.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_save = df_save.withColumn("date_info", lit('2024-06-26'))
    # Export the data to the PostgreSQL database
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5433/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.DorisHelper import DorisHelper
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
if __name__ == '__main__':
spark = SparkUtil.get_spark_session('aba_to_doris_test')
sql = f"""
select *
from dwt_aba_last365
where site_name = 'us'
and date_type = 'month'
and date_info = '2024-10';
"""
df_aba = spark.sql(sql).drop('site_name', 'date_type').cache()
df_aba = df_aba.withColumn(
'date_info', F.concat(F.regexp_replace('date_info', '-', ''), F.lit('01'))
)
df_aba.show(10, True)
columns = df_aba.columns
columns_str = ",".join(columns)
DorisHelper.spark_export_with_columns(df_aba, 'test', 'dwt_aba_last365', columns_str)
print('导出完成')
from openai import OpenAI
api_key = "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"
client = OpenAI(api_key=api_key)
completion = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "Write a haiku about recursion in programming."
}
]
)
print(completion.choices[0].message)
import requests
response = requests.post(
f"https://api.stability.ai/v2beta/stable-image/generate/ultra",
headers={
"authorization": f"sk-f2iOAkResIloOY3yE6xk2LlQbVrtQi3EczZDjA3n9ns7bmeR",
"accept": "image/*"
},
files={"none": ''},
data={
"prompt": "A little cat is in a bedroom with a bed, TV, and sofa",
"output_format": "webp",
},
)
if response.status_code == 200:
with open("./cat01.webp", 'wb') as file:
file.write(response.content)
else:
raise Exception(str(response.json()))
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, BooleanType, MapType
"""
merchantwords: word frequency of tokenized search terms
"""
def is_number(str):
"""
    Check whether a string is a number
:param str:
:return:
"""
import re
return re.match(r"^-?\d+\.?\d+$", str) is not None
def word_tokenize(keyword: str):
import re
    keyword = re.sub(r'\d+\.?\d*|-|"|,|,|?|\?|/|、', '', keyword).strip()
from nltk.tokenize import word_tokenize
result = word_tokenize(keyword, "english")
    # Punctuation tokens filtered out below
filter_arr = [
" ", "\t", "\r", "\n", "(", ")", ",", ",", "[", "]", "、", "-", ":", "&", "|", "+", "``", "'", "'", "\""
]
return list(filter(lambda x: not is_number(x) and x not in filter_arr, result))
def run():
spark = SparkUtil.get_spark_session("app_name")
udf_word_tokenize = F.udf(word_tokenize, ArrayType(StringType()))
keywords_all = spark.sql("select keyword from dwt_merchantwords_st_detail where site_name='us'").cache()
df_all = keywords_all.withColumn("word", F.explode(udf_word_tokenize(F.col("keyword"))))
df_all = df_all.groupby(F.col("word")) \
.agg(F.count("word").alias("frequency")) \
.orderBy(F.col("frequency").desc()) \
.select(
F.col("word"),
F.col("frequency"),
F.lit("us").alias("site_name")
)
hive_tb = 'tmp_word_frequency'
    # Dedup: clear the existing HDFS partition before re-writing
partition_dict = {
"site_name": "us"
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
partition_by = list(partition_dict.keys())
print(f"当前存储的表名为:{hive_tb},分区为{partition_by}", )
df_all.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
def word_pluralize(keyword: str):
from textblob import Word
    # singular form
    singularize = Word(keyword).singularize().string
    # plural form
    pluralize = Word(singularize).pluralize().string
result = {
"text": keyword,
"singularize": singularize,
"pluralize": pluralize,
"pluralizeFlag": keyword == pluralize,
"not_regular": keyword not in [singularize, pluralize]
}
return result
def word_stem(keyword: str):
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)
return stemmer.stem(keyword)
def word_test():
spark = SparkUtil.get_spark_session("word_test")
udf_word_pluralize = F.udf(word_pluralize, StructType(
[
StructField('text', StringType(), True),
StructField('singularize', StringType(), True),
StructField('pluralize', StringType(), True),
StructField('pluralizeFlag', BooleanType(), True),
StructField('not_regular', BooleanType(), True),
]
))
udf_word_stem = F.udf(word_stem, StringType())
keywords_all = spark.sql("select word,frequency from tmp_word_frequency").cache()
keywords_all = keywords_all.withColumn("resultMap", udf_word_pluralize(F.col("word"))).select(
F.col("word"),
F.col("frequency"),
F.col("resultMap").getField("singularize").alias("singularize"),
F.col("resultMap").getField("pluralize").alias("pluralize"),
F.col("resultMap").getField("pluralizeFlag").alias("pluralizeFlag"),
F.col("resultMap").getField("not_regular").alias("not_regular"),
).where("(pluralizeFlag == true) or (not_regular == true)")
    # Compute word stems
keywords_all = keywords_all.withColumn("word_stem", udf_word_stem(F.col("word")))
keywords_all = keywords_all.withColumn("singularize_stem", udf_word_stem(F.col("singularize")))
keywords_all = keywords_all.withColumn("pluralize_stem", udf_word_stem(F.col("pluralize")))
hive_tb = 'tmp_word_not_regular_v2'
keywords_all.write.saveAsTable(name=hive_tb, format='hive', mode='append')
print("success")
def word_for_download():
spark = SparkUtil.get_spark_session("word_for_calc")
keywords_all = spark.sql("""
select word
from tmp_for_market
order by volume desc
""")
CommonUtil.df_export_csv(spark, keywords_all, csv_name='word_for_calc', limit=200 * 10000)
print("success")
pass
if __name__ == '__main__':
# word_for_calc()
word_for_download()
print("success")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark.sql.types import StringType
from utils.templates import Templates
from google.cloud import translate_v2 as translate
class Test(Templates):
def __init__(self):
super().__init__()
self.spark = self.create_spark_object(app_name=f"test")
self.df_st = self.spark.sql(f"select 1+1;")
self.translate_client = translate.Client()
        # Register custom UDF
self.u_translate_text = self.spark.udf.register('translate_text', self.translate_text, StringType())
def translate_text(self, word: str, target_language='zh'):
result = self.translate_client.translate(word, target_language=target_language)
return result['translatedText']
def read_data(self):
sql1 = f"""
select
search_term
from dwt_aba_last365
where site_name = 'us'
and date_type = 'last365day'
and date_info = '2023-12';
"""
self.df_st = self.spark.sql(sql1).limit(20).cache()
def handle_data(self):
self.df_st = self.df_st.withColumn(
'translate_text',
self.u_translate_text(self.df_st['search_term'])
)
self.df_st.show(20, False)
if __name__ == '__main__':
handle_obj = Test()
handle_obj.run()
import os
import sys
import json
sys.path.append(os.path.dirname(sys.path[0]))
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
assert site_name is not None, "site_name 不能为空!"
assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!"
hive_table = f"dwt_flow_asin"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
    # Determine the partition to validate
msg_params = ""
    # Parse partition_dict into partition filter conditions
partition_conditions = []
for key, value in partition_dict.items():
if value is not None:
msg_params += f"{value} "
partition_conditions.append(f"{key} = '{value}'")
base_msg = f"{hive_table} {msg_params} "
site_name = partition_dict.get("site_name")
date_type = partition_dict.get("date_type")
spark_session = SparkUtil.get_spark_sessionV3("check_fields_rule")
    # Load the field-verification rules maintained in the config table
config_table_query = f"""select * from hive_field_verify_config
where table_name ='{hive_table}'
and site_name = '{site_name}'
and use_flag = 1 """
conn_info = DBUtil.get_connection_info('postgresql', 'us')
check_field_df = SparkUtil.read_jdbc_query(
session=spark_session,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=config_table_query
)
    # Collect the verification entries
check_field_list = check_field_df.select('field_name', 'verify_desc', 'verify_type', 'config_json',
'msg_usr_list').collect()
if not check_field_list:
print("============================无验证匹配条件跳过验证===================================")
exit()
    # Create a DataFrame to store the verification results
    # Define the column schema
schema = StructType([
StructField("验证描述", StringType(), True),
StructField("验证类型", StringType(), True),
StructField("校验字段", StringType(), True),
StructField("校验条件查询占比", StringType(), True),
StructField("验证占比临界值上限", StringType(), True),
StructField("验证占比临界值下限", StringType(), True),
StructField("是否验证通过", IntegerType(), True),
])
    # Create an empty DataFrame with the schema defined above
check_df = spark_session.createDataFrame([], schema)
    # Assemble the verification SQL
query = f"""
SELECT COUNT(1) AS total_count
FROM {hive_table}
"""
    # Append the WHERE clause
if partition_conditions:
query_total = query + f" WHERE {' AND '.join(partition_conditions)}"
    # Run the SQL to get the total row count
total_df = spark_session.sql(query_total).cache()
total_count = int(total_df.collect()[0]['total_count'])
for row in check_field_list:
vertify_flag = True
field_name = row['field_name']
verify_type = row['verify_type']
config_json = json.loads(row['config_json'])
msg_usr = row['msg_usr_list']
msg_usr_list = [user.strip() for user in msg_usr.split(",")] if msg_usr else []
sql_condition = config_json['sql_condition']
partition_conf_list = config_json['partition_conf']
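        # Look for a partition config entry that matches the current site_name and date_type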
for conf in partition_conf_list:
conf_site_name = conf["site_name"]
conf_date_type = conf["date_type"]
if site_name == conf_site_name and date_type == conf_date_type:
vertify_flag = True
break
else:
vertify_flag = False
        # No matching dimension configuration
if not vertify_flag:
break
        # Append the extra query condition
if sql_condition:
query_field_check = query_total + f" AND {sql_condition} "
check_count_df = spark_session.sql(query_field_check).cache()
check_count = int(check_count_df.collect()[0]['total_count'])
calcult_rate = round((check_count / total_count), 3)
waring_max = conf['max_rate']
waring_min = conf['min_rate']
verify_flag = 1 if (calcult_rate <= waring_max) and (calcult_rate >= waring_min) else 0
ratio_df = spark_session.createDataFrame([(row['verify_desc'],verify_type,field_name,calcult_rate,waring_max,waring_min,verify_flag)],schema).repartition(1)
check_df = check_df.unionByName(ratio_df, False)
if check_df.count() < 1 :
print("无验证项验证")
exit()
check_df.show(50, truncate=False)
    # Check whether any of the verification results failed
schema_flag = bool(check_df.select(F.min("是否验证通过").alias("result")).first().asDict()['result'])
if not schema_flag:
msg = f"数据表:{hive_table} {msg_params},计算数据存在验证不通过,请检查数据是否异常!!具体信息请查看日志!!"
CommonUtil.send_wx_msg(['chenjianyun'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
spark_session.stop()
pass
def word_tokenize(title: str):
"""
    Tokenizer
"""
from nltk.tokenize import word_tokenize
result = word_tokenize(title, "english")
return result
if __name__ == '__main__':
aba = "nation's bravest tales of courage and heroism"
print(word_tokenize(aba))
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
if __name__ == '__main__':
export_tb = "de_brand_analytics_month"
spark = SparkUtil.get_spark_session("update_de_brand_analytics_month_2024_05")
sql1 = """
select
search_term
from ods_st_quantity_being_sold
where site_name = 'de'
and date_type = 'month'
and date_info = '2024-05'
and quantity_being_sold in (16, 48)
"""
df_aba = spark.sql(sql1)
sql2 = """
select
search_term,
quantity_being_sold
from dwt_merchantwords_merge
where site_name = 'de'
"""
df_me = spark.sql(sql2)
df_save = df_aba.join(
df_me, on='search_term', how='inner'
)
    # Export the data to the PostgreSQL database
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.223:5433/selection_de") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("UpdateMerchantwords")
hive_tb = 'dwd_merchantwords_measure'
partition_dict = {
"site_name": 'us',
"batch": '2023-01'
}
sql1 = f"""
select
keyword,
lang,
st_ao_val,
st_zr_flow_proportion,
min_bid,
max_bid,
suggested_bid,
volume,
avg_3m,
avg_12m,
asin_total_num,
asin_num,
self_asin_num,
self_asin_proportion,
st_sp_counts,
st_zr_counts,
st_monthly_sales,
listing_sales_avg,
reviews_avg,
rating_avg,
price_avg,
depth
from dwd_merchantwords_measure
where site_name = 'us'
and batch = '2023-01';
"""
df_dwd = spark.sql(sqlQuery=sql1).cache()
    df_dwd = df_dwd.repartition(80)  # repartition returns a new DataFrame, so reassign it
sql2 = f"""
select
keyword,
results_count,
sponsored_ads_count,
page_1_reviews,
appearance,
last_seen,
update_time
from dwt_merchantwords_st_detail
where site_name = 'us'
and batch = '2023-1';
"""
df_merchantwords_detail = spark.sql(sqlQuery=sql2)
df_merchantwords_detail = df_merchantwords_detail \
.withColumn("row_num", row_number().over(Window.orderBy("keyword"))) \
.filter("row_num BETWEEN 1 AND 12000000") \
.repartition(80) \
.drop("row_num") \
.cache()
df = df_dwd.join(df_merchantwords_detail, 'keyword', 'left')
df = df.withColumn(
'site_name',
F.lit('us')
).withColumn(
'batch',
F.lit('2023-01')
)
CommonUtil.save_or_update_table(spark_session=spark,
hive_tb_name=hive_tb,
partition_dict=partition_dict,
df_save=df,
drop_exist_tmp_flag=True)
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, col
class WordFrequency(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session("us_aba_last365_word_frequency")
def run(self):
sql1 = f"""
select search_term, date_info
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info in
('2024-10', '2024-09', '2024-08', '2024-07', '2024-06', '2024-05',
'2024-04', '2024-03', '2024-02', '2024-01', '2023-12', '2023-11')
and rank <= 1000000
and st_brand_label = 1;
"""
df_st = self.spark.sql(sql1).cache()
print("df_st数量是:")
print(df_st.count())
sql2 = f"""
select search_term, first_match_brand as brand, date_info
from dws_st_brand_info
where site_name = 'us'
and date_type = 'month'
and date_info in
('2024-10', '2024-09', '2024-08', '2024-07', '2024-06', '2024-05',
'2024-04', '2024-03', '2024-02', '2024-01', '2023-12', '2023-11')
and st_brand_label = 1;
"""
df_brand = self.spark.sql(sql2).cache()
print("df_brand数量是:")
print(df_brand.count())
df_save = df_st.join(
df_brand, on=['date_info', 'search_term'], how='left'
).drop('date_info')
print("df_save数量是:")
print(df_save.count())
df_save = df_save.groupby(['brand']).agg(
count('brand').alias('frequency')
).orderBy('frequency', ascending=False)
df_save.show(20, False)
df_save = df_save.withColumn("frequency", col("frequency").cast("int"))
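        # Sanity check: the summed brand frequencies should equal the number of brand-labelled search terms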
total_sum = df_save.select("frequency").groupBy().sum().collect()[0][0]
if total_sum == df_st.count():
print('验证成功')
else:
print('验证失败')
output_path = "hdfs:///user/chenyuanjie/test1/"
df_save.write.mode("overwrite").format("csv").option("delimiter", "^").option("lineSep", "\n").option("header", "false").option("compression", "none").save(output_path)
if __name__ == '__main__':
obj = WordFrequency()
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
hive_tb = "tmp_us_st_keepa_syn_2024"
hdfs_path = "/home/big_data_selection/tmp/tmp_us_st_keepa_syn_2024"
print(f"hdfs_path is {hdfs_path}")
query = f"""
select
asin
from us_st_keepa_syn_2024
where 1 = 1
and \$CONDITIONS
"""
db_type = "postgresql"
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
site_name='us',
query=query,
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie']
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name='us',
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
        # Delete the HDFS path before importing
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import col, lit
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("us_st_keepa_syn_2024_export")
    # Read existing data from the StarRocks database
sql = """
select distinct asin from selection.us_asin_latest_detail where date_info = '2024-06'
"""
df_sr = StarRocksHelper.spark_import_with_sql(spark, sql).repartition(80, 'asin').cache()
print("starrocks读取:")
df_sr.show(10)
sql = """
select asin from tmp_us_st_keepa_syn_2024;
"""
df_pg = spark.sql(sql).drop_duplicates(['asin']).repartition(80, 'asin').cache()
print("pg读取:")
df_pg.show(10)
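    # Keep only ASINs present in StarRocks but missing from tmp_us_st_keepa_syn_2024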
df = df_sr.subtract(df_pg)
df_sr.unpersist()
df_pg.unpersist()
df = df.withColumn(
'state',
lit(7)
).withColumn(
'asin_trun_4',
col('asin').substr(1, 4)
)
df.show(10)
print(df.count())
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.224:5433/selection") \
.option("dbtable", "us_st_keepa_syn_2024") \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
from sqlalchemy.dialects.postgresql import pypostgresql
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.db_util import DBUtil
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("us_st_keepa_syn_2024_export")
    # Read existing data from the StarRocks database
sql = """
select distinct asin from selection.us_asin_latest_detail where date_info = '2024-06' and (asin_launch_time>'2024-07-19' or asin_launch_time<'1990-01-01')
"""
df_sr = StarRocksHelper.spark_import_with_sql(spark, sql).repartition(80, 'asin').cache()
print("starrocks读取:")
df_sr.show(10)
sql = """
select asin from tmp_us_st_keepa_syn_2024;
"""
df_pg = spark.sql(sql).drop_duplicates(['asin']).repartition(80, 'asin').cache()
print("pg读取:")
df_pg.show(10)
df = df_sr.subtract(df_pg)
print(df.count())
df_sr.unpersist()
df_pg.unpersist()
update_asin = df.select("asin").rdd.map(lambda row: row[0]).collect()
print(update_asin)
pg_engine = DBUtil.get_db_engine('postgresql', 'us')
with pg_engine.begin() as conn:
update_query = f"""
UPDATE us_st_keepa_syn_2024 SET state = 5 WHERE asin IN {tuple(update_asin)}
"""
conn.execute(update_query)
spark.stop()
import os
import sys
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
class VerifyRank(object):
def __init__(self):
        self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
def run(self):
sql = f"""
select
search_term,
rank,
date_info
from ods_brand_analytics
where site_name = 'us'
and date_type = 'week'
and date_info >= '2024-01'
and rank < 100000
"""
df_all = self.spark.sql(sql).repartition(40, 'search_term').cache()
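        # For each search term, compute the leave-one-out mean of its weekly ranks (the mean of all other weeks' ranks)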
def leave_one_out_means(structs):
ranks = [x['rank'] for x in structs]
date_infos = [x['date_info'] for x in structs]
total_sum = sum(ranks)
n = len(ranks)
if n > 1:
means = [round((total_sum - rank) / (n - 1), 2) for rank in ranks]
else:
means = [ranks[0]]
result = [{"means": mean, "date_info": date_info} for mean, date_info in zip(means, date_infos)]
return result
leave_one_out_means_udf = F.udf(leave_one_out_means, ArrayType(StructType([
StructField("means", FloatType(), True),
StructField("date_info", StringType(), True)
])))
df_agg = df_all.groupBy("search_term").agg(
F.collect_list(F.struct("rank", "date_info")).alias("collect_row")
# F.collect_list("rank").alias("values")
)
df_agg = df_agg.withColumn(
"collect_row", leave_one_out_means_udf(F.col("collect_row"))
)
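        # Derive per-term outlier bounds from the leave-one-out means via the 25%/75% quantiles and a wide IQR multiplier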
def calc_quantiles(structs):
values = [x['means'] for x in structs]
            values = sorted(values)  # sort the values within the group
            n = len(values)
            # Compute the Q1 and Q3 index positions (at the 25% and 75% marks)
            q1_index = int(n * 0.25)
            q3_index = int(n * 0.75)
if n > 1:
q1 = values[q1_index]
q3 = values[q3_index]
else:
q1 = values[0]
q3 = values[0]
return [float(q1), float(q3)]
quantile_udf = F.udf(calc_quantiles, ArrayType(FloatType()))
df_agg = df_agg.withColumn(
"quantiles", quantile_udf(F.col("collect_row"))
).withColumn(
"q1", F.col("quantiles")[0]
).withColumn(
"q3", F.col("quantiles")[1]
).withColumn(
"iqr", F.expr("q3 - q1")
).withColumn(
"lower_bound", F.expr("q1 - 100 * iqr")
).withColumn(
"upper_bound", F.expr("q3 + 100 * iqr")
).select(
'search_term', 'collect_row', 'lower_bound', 'upper_bound'
).repartition(40, 'search_term')
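        # Keep terms with at least one out-of-bounds mean, then drop any term whose outliers include 2024-08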
df_save = df_agg.withColumn(
"filtered_collect_row",
F.filter(
"collect_row",
lambda x: (x["means"] < F.col("lower_bound")) | (x["means"] > F.col("upper_bound"))
)
).filter(
F.size(F.col("filtered_collect_row")) > 0
).withColumn(
"has_2024_08",
F.exists(
"filtered_collect_row",
lambda x: x["date_info"].like("2024-08%")
)
).filter(
            ~F.col("has_2024_08")  # filter out rows that contain '2024-08'
).select(
'search_term', 'filtered_collect_row', 'lower_bound', 'upper_bound'
)
df_save.show(20, False)
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
obj = VerifyRank()
obj.run()
@@ -9,8 +9,6 @@ from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# 导入udf公共方法
from yswg_utils.common_udf import udf_parse_bs_category
# from ..yswg_utils.common_udf import udf_parse_bs_category
from utils.spark_util import SparkUtil
from utils.hdfs_utils import HdfsUtils
@@ -39,7 +37,7 @@ class DimAsinAmordersInfo(Templates):
# us month, month_week, 4_week, week
# uk/de month, 4_week, week
if self.site_name in ['us', 'uk', 'de']:
if self.date_type in ['month', 'month_week']:
if self.date_type in ['month', 'month_week', 'month_aba_me']:
if (self.site_name == 'us') or (self.site_name in ['uk', 'de'] and self.date_info >= '2024-05'):
params = f"date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
@@ -69,6 +67,8 @@ class DimAsinAmordersInfo(Templates):
results_list = re.findall(pattern, str(asin_amazon_orders_str).lower())
if len(results_list) == 1:
result = int(results_list[0].replace("k", "000").replace(" ", ""))
if 'week' in asin_amazon_orders_str:
result = result * 4
else:
result = None
return result
@@ -106,7 +106,7 @@ class DimAsinDetail(object):
df_loc = df.loc[df.id == int(cur_id) - 21]
week4 = list(df_loc.year_week)[0]
complete_date_info_tuple = (week1, week2, week3, week4)
elif self.date_type in ['month', 'month_week']:
elif self.date_type in ['month', 'month_week', 'month_aba_me']:
df_loc = df.loc[(df.year_month == f"{self.date_info}") & (df.week_day == 1)]
complete_date_info_tuple = tuple(df_loc.year_week)
print("self.complete_date_info_tuple:", complete_date_info_tuple)
@@ -118,6 +118,8 @@ class DimAsinDetail(object):
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == DateTypes.month.name and self.date_info >= '2023-10':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == 'month_aba_me':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
date_sql = f"and date_type='week' and date_info in {self.complete_date_info_tuple}"
elif self.site_name in ['uk', 'de']:
@@ -154,8 +156,9 @@ class DimAsinDetail(object):
five_star, low_star, together_asin, ac_name, node_id, data_type as asin_data_type, variat_list,
`describe` as asin_describe, follow_sellers as asin_follow_sellers, product_description,
image_view as asin_image_view, spider_int as asin_spider_num, buy_sales, lob_asin_json as asin_lob_info,
REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type, customer_reviews_json, parent_asin, img_list,
created_at as created_time, updated_at as updated_time, updated_at as dt, variat_num as variation_num
REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type,
customer_reviews_json, parent_asin, img_list, created_at as created_time, updated_at as updated_time,
updated_at as dt, variat_num as variation_num, fbm_delivery_price as asin_fbm_price
from ods_asin_detail where site_name='{self.site_name}' {self.date_sql}"""
print(sql)
self.df_asin_detail = self.spark.sql(sqlQuery=sql)
......@@ -208,7 +211,7 @@ class DimAsinDetail(object):
"category_first_name", F.lower("category_first_name")
).repartition(100).persist(StorageLevel.DISK_ONLY)
self.df_asin_category.show(10, truncate=False)
if self.date_type in ['month', 'month_week'] and self.date_info < '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info < '2024-06':
sql = f"""
SELECT asin, parent_asin, color as asin_color, `size` as asin_size, style as asin_style,
CASE WHEN state = 1 THEN 1 WHEN state = 2 THEN 0 ElSE NULL END as asin_is_sale, updated_time
......@@ -259,7 +262,7 @@ class DimAsinDetail(object):
# handle asin variation info
def handle_asin_variation_attribute(self):
if self.date_type in ['month', 'month_week'] and self.date_info >= '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
print("执行新版的变体信息整合")
variat_schema = ArrayType(ArrayType(StringType()))
self.df_asin_detail = self.df_asin_detail.withColumn("variat_list_change", F.from_json(F.col("variat_list"), variat_schema))
......@@ -278,7 +281,7 @@ class DimAsinDetail(object):
self.df_asin_detail = self.df_asin_detail.join(
df_asin_with_variation, on=['asin'], how='left'
)
elif self.date_type in ['month', 'month_week'] and self.date_info < '2024-06':
elif self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info < '2024-06':
print("执行历史数据的变体信息整合")
window = Window.partitionBy(self.df_asin_variat.asin).orderBy(
self.df_asin_variat.updated_time.desc_nulls_last())
......@@ -298,7 +301,7 @@ class DimAsinDetail(object):
# handle asin fulfillment (buy-box seller type) info
def handle_asin_buy_box_seller_type(self):
if (self.date_type in ['month', 'month_week'] and self.date_info >= '2024-05') \
if (self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-05') \
or (self.date_type == '4_week' and self.date_info >= '2024-21'):
self.df_asin_detail = self.df_asin_detail.withColumn(
'seller_json',
......@@ -440,7 +443,7 @@ class DimAsinDetail(object):
# handle asin thumbnail image info
def handle_asin_img_info(self):
if self.date_type in ['month', 'month_week'] and self.date_info >= '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
img_schema = ArrayType(ArrayType(StringType()))
df_asin_with_img = self.df_asin_detail.withColumn("img_list", F.from_json(F.col("img_list"), img_schema)).filter(F.size("img_list") > 0).\
select("asin", F.explode("img_list").alias("img_attributes")).\
......@@ -456,7 +459,7 @@ class DimAsinDetail(object):
# handle the latest variation info under each parent_asin
def handle_latest_variation_info(self):
if self.date_type in ['month', 'month_week'] and self.date_info >= '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
max_report_sql = f"""
SELECT MAX(date_info) as table_date_info FROM {self.doris_db}.{self.parent_asin_latest_detail_table}
"""
......@@ -541,6 +544,7 @@ class DimAsinDetail(object):
"package_quantity", "is_package_quantity_abnormal", "asin_quantity_variation_type", "seller_json",
"asin_bought_month", "asin_length", "asin_width", "asin_height", "asin_is_self",
"customer_reviews_json", "img_list", "variat_list",
F.round("asin_fbm_price", 2).alias("asin_fbm_price"),
F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')).persist(StorageLevel.MEMORY_ONLY)
......
......@@ -47,6 +47,8 @@ class DimAsinLabel(object):
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == DateTypes.month.name and self.date_info >= '2023-10':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == 'month_aba_me':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
date_sql = f"and date_type='week' and date_info in {self.complete_date_info_tuple}"
elif self.site_name in ['uk', 'de']:
......
......@@ -84,7 +84,7 @@ class DimAsinLaunchtimeInfo(object):
else:
print("==================执行分区数据整合=================")
# check each partition for newly added asins
if self.date_type in (DateTypes.week.name,DateTypes.month.name,DateTypes.month_week.name):
if self.date_type in (DateTypes.week.name,DateTypes.month.name,DateTypes.month_week.name, 'month_aba_me'):
# fetch the asin data under the st (search term) dimension
sql = f"""
select
......
......@@ -189,7 +189,9 @@ class DimAsinStableInfo(Templates):
def read_data(self):
if self.site_name == 'us':
params = f" and (date_type='week' or (date_type='month' and date_info='2023-10') or (date_type in ('month_week', 'month') and date_info>='2023-11'))"
# use roughly the most recent year of detail data for the calculation
params = f" and date_type in ('month_week', 'month', 'month_aba_me') and date_info>='2025-01';"
# params = f" and (date_type='week' or (date_type='month' and date_info='2023-10') or (date_type in ('month_week', 'month') and date_info>='2023-11'))"
else:
params = f" and (date_type='week' or (date_type in ('month_week', 'month') and date_info>='2023-05'))"
......
......@@ -95,7 +95,7 @@ class DimCalAsinDetail(object):
and date_type='month' ;
"""
self.date_type = 'day_all'
elif self.date_type in (DateTypes.week.name, DateTypes.month.name, DateTypes.month_week.name):
elif self.date_type in (DateTypes.week.name, DateTypes.month.name, DateTypes.month_week.name, 'month_aba_me'):
sql = f"""select
asin,
asin_img_url,
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DBUtil
class DwdAiAsinAdd(object):
def __init__(self, site_name="us", date_type="month", date_info="2024-10"):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
# derive the historical date_info values
# previous month (month-on-month)
self.date_info_last_month = CommonUtil.get_month_offset(self.date_info, -1)
# same month last year (year-on-year)
self.date_info_last_year = CommonUtil.get_month_offset(self.date_info, -12)
self.df_base_asin = self.spark.sql(f"select 1+1;")
self.df_asin_last_month = self.spark.sql(f"select 1+1;")
self.df_ai_asin = self.spark.sql(f"select 1+1;")
self.df_save1 = self.spark.sql(f"select 1+1;")
self.df_save2 = self.spark.sql(f"select 1+1;")
def run(self):
self.read_data()
self.handle_data()
self.save_data()
def read_data(self):
# select the asins needed for the ASIN library from dwt_flow_asin
sql1 = f"""
select
asin,
asin_bought_month,
category_id,
asin_category_desc
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
and asin_type in (0, 1)
and asin_bought_month >= 50
"""
self.df_base_asin = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin')
# categories that need to be filtered out
sql2 = f"""
select distinct category_id as category_id from category_full_name a
where EXISTS (
select 1 from category_disable_config b where b.id_path is not null and a.id_path like concat(b.id_path, '%') and a.site = b.site
) and a.site = '{self.site_name}'
"""
conn_info = DBUtil.get_connection_info("mysql", "us")
df_filter_category_id = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=sql2
)
# second pass on category names, in case any disabled category was missed
sql3 = f"""
select distinct name_path as asin_category_desc from category_disable_config where site = '{self.site_name}'
"""
df_filter_category_desc = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=sql3
)
# read historical flow-asin data
sql4 = f"""
select
asin,
asin_bought_month,
date_info
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info in ('{self.date_info_last_month}', '{self.date_info_last_year}')
"""
df_flow_asin = self.spark.sql(sqlQuery=sql4).cache()
df_flow_asin_last_month = df_flow_asin.filter(f"date_info = '{self.date_info_last_month}'").withColumnRenamed(
'asin_bought_month', 'asin_bought_last_month'
).drop('date_info').repartition(40, 'asin')
df_flow_asin_last_year = df_flow_asin.filter(f"date_info = '{self.date_info_last_year}'").withColumnRenamed(
'asin_bought_month', 'asin_bought_last_year'
).drop('date_info').repartition(40, 'asin')
# join the historical data
self.df_base_asin = self.df_base_asin.join(
df_filter_category_id, 'category_id', 'left_anti'
).join(
df_filter_category_desc, 'asin_category_desc', 'left_anti'
).join(
df_flow_asin_last_month, 'asin', 'left'
).join(
df_flow_asin_last_year, 'asin', 'left'
).drop('category_id', 'asin_category_desc').cache()
df_flow_asin.unpersist()
print("ASIN信息库基础数据如下:")
self.df_base_asin.show(10, truncate=True)
# read last month's asins to determine which asins are newly added
sql5 = f"""
select asin, 0 as asin_is_new_flag
from dwd_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info_last_month}'
"""
self.df_asin_last_month = self.spark.sql(sqlQuery=sql5).repartition(40, 'asin')
# read the historical ASIN library data
sql6 = f"""
select asin from dim_ai_asin_base where site_name = '{self.site_name}'
"""
self.df_ai_asin = self.spark.sql(sqlQuery=sql6).repartition(40, 'asin').cache()
print(f"ASIN信息库历史数量:{self.df_ai_asin.count()}")
def handle_data(self):
# compute year-on-year and month-on-month change of monthly bought
self.df_base_asin = self.df_base_asin.withColumn(
'asin_bought_yoy',
F.when(
F.col("asin_bought_month").isNull() & F.col("asin_bought_last_year").isNull(), F.lit(None)
).when(
F.col("asin_bought_month").isNull(), F.lit(-1000.0000)
).when(
F.col("asin_bought_last_year").isNull(), F.lit(1000.0000)
).otherwise(
F.round((F.col("asin_bought_month") - F.col("asin_bought_last_year")) / F.col("asin_bought_last_year"), 4)
)
).withColumn(
'asin_bought_mom',
F.when(
F.col("asin_bought_month").isNull() & F.col("asin_bought_last_month").isNull(), F.lit(None)
).when(
F.col("asin_bought_month").isNull(), F.lit(-1000.0000)
).when(
F.col("asin_bought_last_month").isNull(), F.lit(1000.0000)
).otherwise(
F.round((F.col("asin_bought_month") - F.col("asin_bought_last_month")) / F.col("asin_bought_last_month"), 4)
)
)
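# Worked example (hypothetical values) for the change-rate logic above:
# asin_bought_month=125, asin_bought_last_month=100 -> asin_bought_mom = round((125-100)/100, 4) = 0.25
# asin_bought_month=125, asin_bought_last_year=NULL  -> asin_bought_yoy = 1000.0  (sentinel: only present now)
# asin_bought_month=NULL, asin_bought_last_month=80  -> asin_bought_mom = -1000.0 (sentinel: dropped out this month)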
# tag each asin with a sales trend flag: 1 = up, 2 = unchanged, 3 = down, 0 = default
self.df_base_asin = self.df_base_asin.withColumn(
'asin_bought_month_flag',
F.when(
F.col("asin_bought_month").isNull() | F.col("asin_bought_last_month").isNull(), F.lit(0)
).when(
F.col("asin_bought_month") - F.col("asin_bought_last_month") > 0, F.lit(1)
).when(
F.col("asin_bought_month") - F.col("asin_bought_last_month") == 0, F.lit(2)
).when(
F.col("asin_bought_month") - F.col("asin_bought_last_month") < 0, F.lit(3)
).otherwise(F.lit(0))
)
# tag newly added asins (relative to last month's ASIN library): 1 = new, 0 = default
self.df_base_asin = self.df_base_asin.join(
self.df_asin_last_month, 'asin', 'left'
).fillna({
'asin_is_new_flag': 1
}).cache()
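# How the new-asin flag works (restating the join above): asins already present in last month's
# dwd_ai_asin_add come back with asin_is_new_flag = 0 from the left join; asins with no match stay
# NULL and the fillna marks them as 1, i.e. newly added this month.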
def save_data(self):
# standardize output columns
self.df_save1 = self.df_base_asin.select(
F.col("asin"),
F.col("asin_bought_month"),
F.col("asin_bought_last_month"),
F.col("asin_bought_last_year"),
F.col("asin_bought_mom"),
F.col("asin_bought_yoy"),
F.col("asin_bought_month_flag"),
F.col("asin_is_new_flag"),
F.lit(self.site_name).alias("site_name"),
F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info")
).repartition(1)
# persist dwd_ai_asin_add
partition_by1 = ["site_name", "date_type", "date_info"]
hive_tb1 = "dwd_ai_asin_add"
hdfs_path1 = CommonUtil.build_hdfs_path(
hive_tb1,
partition_dict={
"site_name": self.site_name,
"date_type": self.date_type,
"date_info": self.date_info,
}
)
HdfsUtils.delete_file_in_folder(hdfs_path1)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb1},存储路径:{hdfs_path1}")
self.df_save1.write.saveAsTable(name=hive_tb1, format='hive', mode='append', partitionBy=partition_by1)
if self.date_info >= '2024-10':
self.df_save2 = self.df_base_asin.select(F.col("asin")).unionByName(
self.df_ai_asin
).select(
F.col("asin"),
F.lit(self.site_name).alias("site_name")
).distinct().repartition(1)
# persist dim_ai_asin_base
partition_by2 = ["site_name"]
hive_tb2 = "dim_ai_asin_base"
hdfs_path2 = CommonUtil.build_hdfs_path(
hive_tb2,
partition_dict={
"site_name": self.site_name,
}
)
HdfsUtils.delete_file_in_folder(hdfs_path2)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb2},存储路径:{hdfs_path2}")
self.df_save2.write.saveAsTable(name=hive_tb2, format='hive', mode='append', partitionBy=partition_by2)
print("success!")
if __name__ == "__main__":
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
handle_obj = DwdAiAsinAdd(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.templates import Templates
from pyspark.sql import functions as F
class DwdAmazonReport(Templates):
def __init__(self, site_name='us', date_type="month", date_info='2021-10'):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.db_save = f'dwd_amazon_report'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
self.reset_partitions(partitions_num=5)
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.df_buy_data = self.spark.sql(f"select 1+1;")
self.df_st_count = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
# read the asin monthly sales data
sql1 = f"""
select
asin,
asin_amazon_orders as monthly_sales
from
dim_asin_amorders_info
where
site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}';
"""
print(sql1)
self.df_buy_data = self.spark.sql(sqlQuery=sql1).repartition(15, 'asin').cache()
self.df_buy_data.show(10, truncate=True)
sql2 = f"""
select
asin,
asin_zr_counts as zr_count,
asin_sp_counts as sp_count,
asin_st_counts as total_count
from
dwd_asin_measure
where
site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}';
"""
print(sql2)
self.df_st_count = self.spark.sql(sqlQuery=sql2).repartition(15, 'asin').cache()
self.df_st_count.show(10, truncate=True)
def handle_data(self):
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwd/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
self.df_save = self.df_buy_data.join(
self.df_st_count, on='asin', how='full'
)
columns = self.df_save.columns
for col_name in columns:
self.df_save = self.df_save.withColumn(
col_name, self.df_save[col_name].cast('string')
)
self.df_save = self.df_save.fillna('-1')
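# Note (an inference from the downstream dwt job): every column is cast to string and missing values
# are filled with the '-1' placeholder, presumably so the dwt layer can later concat month-by-month
# histories with concat_ws without losing positions to NULLs.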
self.df_save = self.df_save.withColumn(
"weekly_sales", F.lit(None)
).withColumn(
"weekly_views", F.lit(None)
).withColumn(
"monthly_views", F.lit(None)
).withColumn(
"site_name", F.lit(self.site_name)
).withColumn(
"date_type", F.lit(self.date_type)
).withColumn(
"date_info", F.lit(self.date_info)
)
if __name__ == '__main__':
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
if (site_name in ['us', 'uk', 'de']) and (date_type == 'month') and (date_info >= '2024-04'):
handle_obj = DwdAmazonReport(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
else:
print("暂不计算该维度数据!")
quit()
......@@ -217,7 +217,7 @@ class DwdAsinToPg(Templates):
self.df_save = self.df_save.fillna({"asin_is_variation": 0})
self.df_save.show(10, truncate=False)
print("self.df_save.count:", self.df_save.count())
users = ["fangxingjun", "wangrui4", "pengyanbing"]
users = ["fangxingjun", "chenyuanjie", "pengyanbing"]
title = f"dwd_asin_to_pg: {self.site_name}, {self.date_type}, {self.date_info}"
content = f"整合asin完成--等待导出到pg提供爬虫使用--self.df_save.count: {self.df_save.count()}"
CommonUtil().send_wx_msg(users=users, title=title, content=content)
......
......@@ -126,6 +126,8 @@ class DwdStMeasure(Templates):
if date_type in ['month', 'month_week'] and ((self.site_name == 'us' and date_info >= '2023-10') or (self.site_name in ['uk', 'de'] and self.date_info >= '2024-05')):
sql = f"select * from dim_st_asin_info where site_name='{self.site_name}' and date_type='month' and date_info ='{self.date_info}'"
elif date_type in ['month_aba_me']:
sql = f"select * from dim_st_asin_info where site_name='{self.site_name}' and date_type='month_aba_me' and date_info ='{self.date_info}'"
# else:
# if (int(self.year) == 2022 and int(self.month) < 10) or int(self.year) <= 2021:
......@@ -175,7 +177,7 @@ class DwdStMeasure(Templates):
self.df_asin_bs = self.spark.sql(sql).cache()
self.df_asin_bs.show(10)
sql = f"select asin, asin_title, asin_price, parent_asin " \
sql = f"select asin, asin_title, asin_price, parent_asin, asin_bought_month " \
f"from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type.replace('_old', '')}' and date_info='{self.date_info}';"
print("sql:", sql)
self.df_asin_detail = self.spark.sql(sql).cache()
......@@ -239,6 +241,8 @@ class DwdStMeasure(Templates):
self.handle_join()
self.df_save_asin = self.handle_st_asin_counts(cal_type="asin", df_templates=self.df_asin_templates, page=3)
self.df_save_st = self.handle_st_asin_counts(cal_type="st", df_templates=self.df_st_templates)
self.df_save_st.show(10, truncate=False)
print(f"self.df_save_st: {self.df_save_st.count()}")
# self.handle_st_zr_page1_title_rate()
self.handle_st_zr_sp_page123_title_rate(data_type='zr', page_type='page1')
self.handle_st_zr_sp_page123_title_rate(data_type='zr', page_type='page123')
......@@ -413,17 +417,34 @@ class DwdStMeasure(Templates):
df_st_asin_agg, on=['asin'], how='left'
)
elif cal_type == "st":
# compute the DD ratios per search term (share of asins at or above the 50/100/200 monthly-bought thresholds)
df_asin_bought_month = self.df_asin_detail.select('asin', 'asin_bought_month').join(
self.df_asin_amazon_orders, on=['asin'], how='left'
).withColumn(
"asin_bought_month", F.coalesce(F.col("asin_bought_month"), F.col("asin_amazon_orders"))
).drop("asin_amazon_orders")
df_st_asin_agg = self.df_st_asin_duplicated.select("search_term", "asin").join(
self.df_asin_self, on='asin', how='left'
).join(
df_asin_bought_month, on='asin', how='left'
).withColumn(
"is_self_asin",
F.when(F.col("is_self_asin").isNotNull(), F.col("is_self_asin")).otherwise(F.lit(0))
).groupby(['search_term']).agg(
F.sum('is_self_asin').alias("st_self_asin_counts"),
F.count('asin').alias("st_total_asin_counts")
F.count('asin').alias("st_total_asin_counts"),
F.sum(F.when(F.col("asin_bought_month") >= 50, 1).otherwise(0)).alias("st_dd50_counts"),
F.sum(F.when(F.col("asin_bought_month") >= 100, 1).otherwise(0)).alias("st_dd100_counts"),
F.sum(F.when(F.col("asin_bought_month") >= 200, 1).otherwise(0)).alias("st_dd200_counts")
).withColumn(
'st_self_asin_proportion', F.round(F.col('st_self_asin_counts') / F.col('st_total_asin_counts'), 4)
).withColumn(
'st_dd50_proportion', F.round(F.col('st_dd50_counts') / F.col('st_total_asin_counts'), 4)
).withColumn(
'st_dd100_proportion', F.round(F.col('st_dd100_counts') / F.col('st_total_asin_counts'), 4)
).withColumn(
'st_self_asin_proportion',
F.round(F.col('st_self_asin_counts') / F.col('st_total_asin_counts'), 4)
'st_dd200_proportion', F.round(F.col('st_dd200_counts') / F.col('st_total_asin_counts'), 4)
)
df = df.join(
df_st_asin_agg, on=['search_term'], how='left'
......@@ -708,6 +729,7 @@ class DwdStMeasure(Templates):
).join(
df_st_flow_proportion_matrix, on=['search_term'], how='left'
)
print(f"{'===' * 10}, self.df_save_st: {self.df_save_st.count()}")
self.df_save_st.show(10, truncate=False)
def handle_st_num(self):
......
......@@ -181,7 +181,10 @@ class DwtAbaStAnalytics(Templates):
st_zr_counts,
st_sp_counts,
st_self_asin_counts,
st_self_asin_proportion
st_self_asin_proportion,
st_dd50_proportion,
st_dd100_proportion,
st_dd200_proportion
from dwd_st_measure
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
......@@ -903,7 +906,10 @@ class DwtAbaStAnalytics(Templates):
"st_self_asin_proportion",
"lang",
"asin_movie_type_count",
"is_hidden_cate"
"is_hidden_cate",
"st_dd50_proportion",
"st_dd100_proportion",
"st_dd200_proportion"
)
# null handling
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F, Window
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
class DwtAiAsinAdd(object):
def __init__(self, site_name="us", date_type="month", date_info="2024-10"):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
# list of the most recent 6 months
self.last_6_month = []
for i in range(0, 6):
self.last_6_month.append(CommonUtil.get_month_offset(self.date_info, -i))
self.df_base_asin = self.spark.sql(f"select 1+1;")
self.df_flow_asin_detail = self.spark.sql(f"select 1+1;")
self.df_fb_info = self.spark.sql(f"select 1+1;")
self.df_ods_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_asin_bought_flag = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def run(self):
self.read_data()
self.handle_data()
self.save_data()
def read_data(self):
# read the ASIN library base data
sql1 = f"""
select
asin,
asin_bought_month,
asin_bought_mom,
asin_bought_yoy,
asin_bought_month_flag,
asin_is_new_flag
from dwd_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_base_asin = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin').cache()
print("ASIN信息库基础数据如下:")
self.df_base_asin.show(10, truncate=True)
# read the flow-asin detail data
sql2 = f"""
select
asin,
asin_weight,
asin_category_desc,
asin_img_url,
asin_title,
asin_brand_name,
account_name,
asin_buy_box_seller_type,
asin_launch_time,
asin_img_num,
case when variation_num > 0 then 1 else 0 end as variation_flag,
variation_num,
asin_ao_val,
category_first_id,
category_id,
parent_asin,
first_category_rank,
asin_price,
asin_rating,
asin_total_comments,
asin_launch_time_type,
asin_describe
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
and asin_type in (0, 1)
and asin_bought_month >= 50
"""
self.df_flow_asin_detail = self.spark.sql(sqlQuery=sql2).repartition(40, 'asin').cache()
print("流量选品详情数据如下:")
self.df_flow_asin_detail.show(10, truncate=True)
# read the seller (store) data
sql3 = f"""
select
account_name,
seller_id,
fb_country_name,
business_addr
from dwt_fb_base_report
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_fb_info = self.spark.sql(sqlQuery=sql3).dropDuplicates(['account_name']).cache()
print("店铺详情数据如下:")
self.df_fb_info.show(10, truncate=True)
# read review_json_list and other detail fields
sql4 = f"""
select
asin,
review_json_list,
product_json,
product_detail_json,
updated_at
from ods_asin_detail
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_ods_asin_detail = self.spark.sql(sqlQuery=sql4)
window = Window.partitionBy(['asin']).orderBy(
self.df_ods_asin_detail.updated_at.desc_nulls_last()
)
self.df_ods_asin_detail = self.df_ods_asin_detail.withColumn(
'rank', F.row_number().over(window=window)
).filter('rank = 1').drop('rank', 'updated_at').repartition(40, 'asin').cache()
print("ods详情数据如下:")
self.df_ods_asin_detail.show(10, truncate=True)
# join the dataframes together
self.df_ai_asin_detail = self.df_base_asin.join(
self.df_flow_asin_detail, 'asin', 'left'
).join(
self.df_ods_asin_detail, 'asin', 'left'
).join(
self.df_fb_info, 'account_name', 'left'
).cache()
self.df_base_asin.unpersist()
self.df_flow_asin_detail.unpersist()
self.df_fb_info.unpersist()
self.df_ods_asin_detail.unpersist()
# read the monthly-sales flags from dwd_ai_asin_add
sql5 = f"""
select
asin,
asin_bought_month_flag
from dwd_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info in ({CommonUtil.list_to_insql(self.last_6_month)})
"""
self.df_asin_bought_flag = self.spark.sql(sqlQuery=sql5).repartition(40, 'asin').cache()
print("dwd_ai_asin_add月销标识数据如下:")
self.df_asin_bought_flag.show(10, truncate=True)
def handle_data(self):
# ascending-product flag: sales rose in each of the last 6 consecutive months
self.df_asin_bought_flag = self.df_asin_bought_flag.groupBy('asin').agg(
F.sum(F.when(F.col('asin_bought_month_flag') == 1, 1).otherwise(0)).alias('sum_flag')
).withColumn(
'is_ascending_flag', F.when(F.col('sum_flag') == 6, 1).otherwise(0)
)
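# Example of the ascending flag (hypothetical): with self.last_6_month covering '2024-05' to '2024-10',
# an asin gets is_ascending_flag = 1 only if asin_bought_month_flag == 1 (sales up) in all six of those
# months, i.e. sum_flag == 6; any missing month or non-"up" month yields 0.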
def save_data(self):
# standardize output columns
self.df_save = self.df_ai_asin_detail.join(
self.df_asin_bought_flag, 'asin', 'left'
).select(
F.col("asin"),
F.col("asin_weight").alias("weight"),
F.col("asin_bought_month").alias("bought_month"),
F.col("asin_category_desc").alias("category"),
F.col("asin_img_url").alias("img"),
F.col("asin_title").alias("title"),
F.col("asin_brand_name").alias("brand"),
F.col("account_name"),
F.col("business_addr").alias("account_addr"),
F.col("asin_buy_box_seller_type").alias("buy_box_seller_type"),
F.col("asin_launch_time").alias("launch_time"),
F.col("asin_img_num").alias("img_num"),
F.col("variation_flag"),
F.col("variation_num"),
F.col("asin_ao_val").alias("ao_val"),
F.col("category_first_id").alias("category_id"),
F.col("category_id").alias("category_current_id"),
F.col("parent_asin"),
F.col("first_category_rank").alias("bsr_rank"),
F.col("asin_price").alias("price"),
F.col("asin_rating").alias("rating"),
F.col("asin_total_comments").alias("total_comments"),
F.col("seller_id"),
F.col("fb_country_name"),
F.col("review_json_list"),
F.col("asin_launch_time_type").alias("launch_time_type"),
F.col("asin_describe").alias("describe"),
F.col("product_json"),
F.col("product_detail_json"),
F.col("asin_bought_mom").alias("bought_month_mom"),
F.col("asin_bought_yoy").alias("bought_month_yoy"),
F.col("asin_is_new_flag").alias("is_new_flag"),
F.col("is_ascending_flag"),
F.lit(self.site_name).alias("site_name"),
F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info")
).repartition(100).cache()
# persist the data
partition_by = ["site_name", "date_type", "date_info"]
hive_tb = "dwt_ai_asin_add"
hdfs_path = CommonUtil.build_hdfs_path(
hive_tb,
partition_dict={
"site_name": self.site_name,
"date_type": self.date_type,
"date_info": self.date_info,
}
)
HdfsUtils.delete_file_in_folder(hdfs_path)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb},存储路径:{hdfs_path}")
self.df_save.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
print("success!")
if __name__ == "__main__":
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
handle_obj = DwtAiAsinAdd(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
......@@ -17,40 +18,37 @@ class DwtAmazonReport(Templates):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.db_save = f'dwt_amazon_report'
self.date_info_pre = CommonUtil.get_month_offset(self.date_info, -1)
self.db_save = 'dwt_amazon_report'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
self.reset_partitions(partitions_num=120)
self.reset_partitions(partitions_num=200)
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.df_dwd_new = self.spark.sql(f"select 1+1;")
self.df_dwd_old = self.spark.sql(f"select 1+1;")
self.df_asin_detail_new = self.spark.sql(f"select 1+1;")
self.df_asin_detail_old = self.spark.sql(f"select 1+1;")
self.df_joined = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
# read this month's data from the dwd layer
# read this month's monthly sales from dwt_flow_asin
sql1 = f"""
select
asin,
monthly_sales as new_monthly_sales,
zr_count as new_zr_count,
sp_count as new_sp_count,
total_count as new_total_count,
asin_bought_month as new_monthly_sales,
asin_zr_counts as new_zr_count,
asin_sp_counts as new_sp_count,
asin_st_counts as new_total_count,
date_info as new_date_info_list
from
dwd_amazon_report
where
site_name = '{self.site_name}'
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}';
"""
print(sql1)
self.df_dwd_new = self.spark.sql(sqlQuery=sql1).repartition(15, 'asin').cache()
self.df_dwd_new.show(10, truncate=True)
self.df_asin_detail_new = self.spark.sql(sqlQuery=sql1).repartition(15, 'asin').fillna('-1').cache()
self.df_asin_detail_new.show(10, truncate=True)
# read last month's data from the dwt layer
date_info_pre = CommonUtil.get_month_offset(self.date_info, -1)
# read the historical data
sql2 = f"""
select
asin,
......@@ -59,25 +57,19 @@ class DwtAmazonReport(Templates):
sp_count as old_sp_count,
total_count as old_total_count,
date_info_list as old_date_info_list
from
dwt_amazon_report
where
site_name = '{self.site_name}'
from dwt_amazon_report
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{date_info_pre}';
and date_info = '{self.date_info_pre}';
"""
print(sql2)
self.df_dwd_old = self.spark.sql(sqlQuery=sql2).repartition(15, 'asin').cache()
self.df_dwd_old.show(10, truncate=True)
self.df_asin_detail_old = self.spark.sql(sqlQuery=sql2).repartition(15, 'asin').cache()
self.df_asin_detail_old.show(10, truncate=True)
def handle_data(self):
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
# column names after the join
join_columns = ['monthly_sales', 'zr_count', 'sp_count', 'total_count', 'date_info_list']
# count how many date_info values the historical df carries, to know how many '-1' placeholders unmatched historical asins need
old_date_info_first = self.df_dwd_old.select('old_date_info_list').distinct().first()
old_date_info_first = self.df_asin_detail_old.select('old_date_info_list').distinct().first()
if old_date_info_first is None:
old_date_info_list = None
old_date_info_list_len = 0
......@@ -88,9 +80,9 @@ class DwtAmazonReport(Templates):
# if this month's side has no match, pad a single '-1'
fillna_new = '-1'
# join the dfs and fill null values
self.df_joined = self.df_dwd_new.join(
self.df_dwd_old, on='asin', how='full'
)
self.df_joined = self.df_asin_detail_new.join(
self.df_asin_detail_old, on='asin', how='full'
).cache()
for col in join_columns:
self.df_joined = self.df_joined.fillna({'old_' + col: fillna_old})
self.df_joined = self.df_joined.fillna({'new_' + col: fillna_new})
......@@ -104,25 +96,17 @@ class DwtAmazonReport(Templates):
if old_date_info_first is None:
for col in join_columns:
self.df_joined = self.df_joined.withColumn(
col,
self.df_joined['new_' + col]
col, self.df_joined['new_' + col]
)
else:
for col in join_columns:
self.df_joined = self.df_joined.withColumn(
col,
concat_ws(',', self.df_joined['old_' + col], self.df_joined['new_' + col])
col, concat_ws(',', self.df_joined['old_' + col], self.df_joined['new_' + col])
)
# select the required columns
selected_columns = ['asin'] + join_columns
self.df_save = self.df_joined.select(selected_columns)
self.df_save = self.df_save.withColumn(
"weekly_sales", F.lit(None)
).withColumn(
"weekly_views", F.lit(None)
).withColumn(
"monthly_views", F.lit(None)
).withColumn(
"site_name", F.lit(self.site_name)
).withColumn(
"date_type", F.lit(self.date_type)
......@@ -130,14 +114,14 @@ class DwtAmazonReport(Templates):
"date_info", F.lit(self.date_info)
)
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
if __name__ == '__main__':
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
if (site_name in ['us', 'uk', 'de']) and (date_type == 'month') and (date_info >= '2024-04'):
handle_obj = DwtAmazonReport(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
else:
print("暂不计算该维度数据!")
quit()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark.sql import functions as F
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
class DwtAsinRelatedTraffic(object):
def __init__(self, site_name, date_type, date_info):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.hive_tb = f'dwt_asin_related_traffic'
self.partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
self.hdfs_path = CommonUtil.build_hdfs_path(self.hive_tb, partition_dict=self.partition_dict)
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.df_dim_asin_related_traffic = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.col_num_index = {
"four_star_above": 1,
"brand_recommendation": 2,
"similar_items": 3,
"look_and_look": 4,
"look_also_look": 5,
"look_but_bought": 6,
"bundle_bought": 7,
"combination_bought": 8,
"more_relevant": 9,
"bought_and_bought": 10,
"product_adv": 11,
"brand_adv": 12
}
def read_data(self):
print("读取dim_asin_related_traffic流量数据")
sql = f"""
select
asin,
four_star_above,
brand_recommendation,
similar_items,
look_and_look,
look_also_look,
look_but_bought,
bundle_bought,
combination_bought,
more_relevant,
bought_and_bought,
product_adv,
brand_adv,
updated_at as related_time
from dim_asin_related_traffic where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}';
"""
self.df_dim_asin_related_traffic = self.spark.sql(sqlQuery=sql).cache()
print("dim_asin_related_traffic数据如下:")
self.df_dim_asin_related_traffic.show(10, True)
# aggregation
def handle_data(self):
cols = [col for col in self.df_dim_asin_related_traffic.columns if col != 'asin' and col != 'related_time']
for col in cols:
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
col, F.concat_ws(",", F.filter(F.split(F.col(col), ","), lambda x: (F.length(F.trim(x)) == 10)))
).withColumn(
col, F.when(F.col(col) == "", None).otherwise(F.col(col))
)
# concatenate the related-traffic asins of all types
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
"related_asin", F.concat_ws(",", *[F.col(col) for col in cols])
)
# use the type map to build a code column with one type code per related asin
for col in cols:
num = self.col_num_index[col]
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
f"{col}_num", F.when(
F.col(col).isNull(), F.lit(None)
).otherwise(
F.concat_ws(",", F.array_repeat(F.lit(num), F.size(F.split(F.col(col), ","))))
)
)
# concatenate all the code columns
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
"related_type", F.concat_ws(",", *[F.col(f"{col}_num") for col in cols])
).cache()
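# Illustrative (hypothetical) row: similar_items='B0ABCD1234,B0ABCD1235', product_adv='B0ABCD1236',
# all other type columns NULL. With col_num_index (similar_items -> 3, product_adv -> 11) the result is
# related_asin = 'B0ABCD1234,B0ABCD1235,B0ABCD1236' and related_type = '3,3,11', i.e. one type code per
# related asin, in the same order.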
# count how many asins reference each related asin
df_related = self.df_dim_asin_related_traffic.select(
'asin', F.explode(F.split(F.col('related_asin'), ',')).alias('related_asin')
).drop_duplicates(['asin', 'related_asin']).groupBy(
'related_asin'
).agg(
F.count('asin').alias('related_count')
).withColumnRenamed(
'related_asin', 'asin'
)
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.join(
df_related, on='asin', how='left'
).fillna({
'related_count': 0
})
# persist the data
def save_data(self):
self.df_save = self.df_dim_asin_related_traffic.select(
'asin',
'related_asin',
'related_type',
'related_time',
'related_count',
F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')
)
print(f"清除hdfs目录中:{self.hdfs_path}")
HdfsUtils.delete_file_in_folder(self.hdfs_path)
print(f"当前存储的表名为:{self.hive_tb},分区为:{self.partitions_by}")
self.df_save.repartition(40).write.saveAsTable(name=self.hive_tb, format='hive', mode='append', partitionBy=self.partitions_by)
print("success")
def run(self):
# read data
self.read_data()
# aggregate
self.handle_data()
# persist
self.save_data()
if __name__ == '__main__':
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
handle_obj = DwtAsinRelatedTraffic(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
......@@ -65,33 +65,62 @@ class DwtFbBaseReport(object):
# initialize UDF functions
self.udf_new_asin_flag = F.udf(udf_new_asin_flag, IntegerType())
self.u_judge_package_quantity = F.udf(udf_get_package_quantity, IntegerType())
self.u_get_business_val = F.udf(self.get_business_val, StringType())
# parse the seller_address field to extract the seller's business info
@staticmethod
def get_business_val(seller_address, key):
if not seller_address:
return None
parts = [p.strip() for p in seller_address.split("|-|")]
for i, p in enumerate(parts):
if p.startswith(key):
# Business Address: join everything that follows
if key in ("Business Address", "Geschäftsadresse"):
return " ".join(parts[i + 1:]).strip()
# other keys: take only the next segment
elif i + 1 < len(parts):
return parts[i + 1].strip()
return None
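# Usage sketch (hypothetical seller_address value, not real data):
#   addr = "Business Name:|-|ACME Ltd|-|Business Address:|-|12 Main St|-|London"
#   get_business_val(addr, "Business Name")    -> "ACME Ltd"
#   get_business_val(addr, "Business Address") -> "12 Main St London"   # joins everything after the key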
def read_data(self):
# ods_seller_account_feedback: main table of the monthly store feedback report
print("获取 ods_seller_account_feedback")
sql = f"""select cur_fd.seller_id,
sql = f"""
select
cur_fd.seller_id,
cur_fd.fb_web_asin_num,
cur_fd.fb_country_name,
cur_fd.count_30_day_num,
cur_fd.count_1_year_num,
cur_fd.count_lifetime_num,
cur_fd.seller_address,
cur_fd.fb_crawl_date,
round((count_30_day_num - last_30_day_num) / last_30_day_num, 4) as count_30_day_rate,
round((count_1_year_num - last_1_year_num) / last_1_year_num, 4) as count_1_year_rate,
round((count_lifetime_num - last_lifetime_num) / last_lifetime_num, 4) as count_life_time_rate
from (select seller_id,
from
(
select
seller_id,
num as fb_web_asin_num,
count_30_day as count_30_day_num,
count_1_year as count_1_year_num,
count_lifetime as count_lifetime_num,
country_name as fb_country_name,
seller_address,
date_format(updated_at, 'yyyy-MM-dd HH:mm:ss') as fb_crawl_date
from ods_seller_account_feedback
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
and length(seller_id) > 2 ) cur_fd
left join (select seller_id,
and length(seller_id) > 2
) cur_fd
left join
(
select
seller_id,
count_30_day as last_30_day_num,
count_1_year as last_1_year_num,
count_lifetime as last_lifetime_num
......@@ -99,8 +128,10 @@ from (select seller_id,
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.last_month}'
and length(seller_id) > 2 ) last_fd
on cur_fd.seller_id = last_fd.seller_id"""
and length(seller_id) > 2
) last_fd
on cur_fd.seller_id = last_fd.seller_id
"""
self.df_fb_feedback = self.spark.sql(sqlQuery=sql)
self.df_fb_feedback = self.df_fb_feedback.drop_duplicates(['seller_id']).cache()
print(sql)
......@@ -108,9 +139,8 @@ from (select seller_id,
# fetch our internal store-to-asin relation table (scraped via search terms)
print("获取 ods_seller_asin_account")
sql = f"""
select seller_id,asin from ods_seller_asin_account
where site_name='{self.site_name}'
and date_format(created_at,'yyyy-MM-dd') <= '{self.cal_date}'
select seller_id, asin from ods_seller_asin_account
where site_name='{self.site_name}' and date_format(created_at,'yyyy-MM-dd') <= '{self.cal_date}'
"""
self.df_fb_asin = self.spark.sql(sqlQuery=sql)
self.df_fb_asin = self.df_fb_asin.drop_duplicates(['seller_id', 'asin'])
......@@ -157,16 +187,17 @@ from (select seller_id,
# extract parent_asin (from the variation data) to flag multi-variation asins
print("获取 dim_asin_variation_info")
sql = f"select asin,parent_asin from dim_asin_variation_info " \
f"where site_name='{self.site_name}'" \
f" and asin != parent_asin "
sql = f"""
select asin, parent_asin from dim_asin_variation_info where site_name='{self.site_name}' and asin != parent_asin
"""
self.df_asin_parent = self.spark.sql(sqlQuery=sql)
print(sql)
# fetch ods_seller_account_syn to extract account_name
print("获取 ods_seller_account_syn")
sql = f"select seller_id,account_name,id from ods_seller_account_syn " \
f"where site_name='{self.site_name}'"
sql = f"""
select seller_id, account_name, id from ods_seller_account_syn where site_name='{self.site_name}'
"""
self.df_seller_account = self.spark.sql(sqlQuery=sql)
# deduplicate
self.df_seller_account = self.df_seller_account.orderBy(self.df_seller_account.id.desc())
......@@ -177,8 +208,7 @@ from (select seller_id,
# fetch mysql selection.accounts, used to exclude the company's own stores
print("获取 selection.accounts")
sql = f"""
select seller_id, 1 as is_self_fb from
(select distinct seller_id from selection.accounts) t1
select seller_id, 1 as is_self_fb from (select distinct seller_id from selection.accounts) t1
"""
conn_info = DBUtil.get_connection_info("mysql", "us")
self.df_self_seller_id = SparkUtil.read_jdbc_query(
......@@ -291,6 +321,22 @@ from (select seller_id,
# unmatched rows are set to 0, i.e. not an internal store
self.df_fb_agg = self.df_fb_agg.na.fill({"is_self_fb": 0})
# parse the seller business info, handled differently per site language
if self.site_name in ("us", "uk"):
self.df_fb_agg = self.df_fb_agg.withColumn(
"business_name", self.u_get_business_val(F.col("seller_address"), F.lit("Business Name"))
).withColumn(
"business_addr", self.u_get_business_val(F.col("seller_address"), F.lit("Business Address"))
)
elif self.site_name == "de":
self.df_fb_agg = self.df_fb_agg.withColumn(
"business_name", self.u_get_business_val(F.col("seller_address"), F.lit("Geschäftsname"))
).withColumn(
"business_addr", self.u_get_business_val(F.col("seller_address"), F.lit("Geschäftsadresse"))
)
else:
pass
# output dataset: report
def save_data_report(self):
# join ods_seller_account_syn to bring back account_name; the inner join drops sellers without a store name
......@@ -358,6 +404,8 @@ from (select seller_id,
F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:SS').alias('updated_time'),
F.lit(None).alias('usr_mask_type'),
F.lit(None).alias('usr_mask_progress'),
F.col('business_name'),
F.col('business_addr'),
F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')
......
......@@ -56,7 +56,11 @@ class DwtFlowAsin(Templates):
self.df_save = self.spark.sql(f"select 1+1;")
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.reset_partitions(60)
self.launch_time_interval_dict = self.get_launch_time_interval_dict()
launch_time_base_date = self.spark.sql(
f"""SELECT max(`date`) AS last_day FROM dim_date_20_to_30 WHERE year_month = '{self.date_info}'"""
).collect()[0]['last_day']
self.launch_time_interval_dict = self.get_launch_time_interval_dict(launch_time_base_date)
# initialize global dataframes
self.df_asin_detail = self.spark.sql(f"select 1+1;")
self.df_asin_measure = self.spark.sql(f"select 1+1;")
......@@ -123,15 +127,15 @@ class DwtFlowAsin(Templates):
return str(datetime.now().date())
@staticmethod
def get_launch_time_interval_dict():
cur_date = datetime.now().date()
def get_launch_time_interval_dict(base_date):
base_date = datetime.strptime(base_date, '%Y-%m-%d')
return {
"one_month": (cur_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (cur_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (cur_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (cur_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (cur_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (cur_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
"one_month": (base_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (base_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (base_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (base_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (base_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (base_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
}
@staticmethod
......@@ -167,7 +171,7 @@ class DwtFlowAsin(Templates):
date_format(created_time, 'yyyy-MM-dd HH:mm:ss') as asin_crawl_date, asin_bought_month, asin_image_view,
case when product_description is not null then 1 else 0 end as is_with_product_description, asin_describe,
category_id as top_category_id, category_first_id as top_category_first_id, customer_reviews_json, img_list as img_info,
asin_follow_sellers as follow_sellers_count
asin_follow_sellers as follow_sellers_count, asin_fbm_price
from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'"""
print("sql:" + sql)
self.df_asin_detail = self.spark.sql(sqlQuery=sql)
......@@ -295,7 +299,10 @@ class DwtFlowAsin(Templates):
WHEN asin_price >= 15 AND asin_price < 20 THEN 3
WHEN asin_price >= 20 AND asin_price < 30 THEN 4
WHEN asin_price >= 30 AND asin_price < 50 THEN 5
WHEN asin_price >= 50 THEN 6 ELSE 0 END"""))
WHEN asin_price >= 50 AND asin_price < 70 THEN 6
WHEN asin_price >= 70 AND asin_price < 100 THEN 7
WHEN asin_price >= 100 AND asin_price < 150 THEN 8
WHEN asin_price >= 150 THEN 9 ELSE 0 END"""))
# handle asin category, rank, rank type and whether the rank is valid
def handle_asin_category_info(self):
......@@ -396,7 +403,7 @@ class DwtFlowAsin(Templates):
"img_num_rating") + F.col("img_enlarge_rating")
)
self.df_asin_detail = self.df_asin_detail.\
drop("is_with_product_description", "asin_describe", "asin_image_view", "category_node_rating", "zr_rating",
drop("is_with_product_description", "asin_image_view", "category_node_rating", "zr_rating",
"sp_rating", "a_add_rating", "video_rating", "brand_rating", "product_describe_rating",
"highlight_rating", "title_len_rating", "title_brand_rating", "img_num_rating", "img_enlarge_rating")
......@@ -500,7 +507,7 @@ class DwtFlowAsin(Templates):
F.lit(None).alias("buy_data_viewed_month"), F.lit(None).alias("buy_data_viewed_week"),
F.lit(None).alias("theme_en"), F.lit(None).alias("theme_label_en"), "asin_lqs_rating",
"asin_lqs_rating_detail", "title_matching_degree", "zr_flow_proportion", "matrix_flow_proportion",
"matrix_ao_val", "follow_sellers_count", "seller_json",
"matrix_ao_val", "follow_sellers_count", "seller_json", "asin_describe", "asin_fbm_price",
F.lit(self.site_name).alias("site_name"), F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info"))
self.df_save = self.df_save.na.fill(
......@@ -514,7 +521,7 @@ class DwtFlowAsin(Templates):
"package_quantity": 1, "is_movie_label": 0, "is_brand_label": 0, "is_alarm_brand": 0,
"title_matching_degree": 0.0, "asin_lqs_rating": 0.0, "follow_sellers_count": -1})
self.df_save = self.df_save.repartition(60).persist(StorageLevel.DISK_ONLY)
self.df_save = self.df_save.drop_duplicates(['asin']).filter(F.length(F.col("asin"))<=10)
self.df_save = self.df_save.drop_duplicates(['asin']).filter((F.col("asin").isNotNull()) & (F.col("asin") != "") & (F.length(F.col("asin")) <= 10))
print("数据量为:", self.df_save.count())
self.df_save.show(10, truncate=False)
......@@ -550,14 +557,14 @@ class DwtFlowAsin(Templates):
F.col("current_category_rank").alias("category_current_rank"), "asin_type",
"bsr_orders", F.col("sales").alias("bsr_orders_sale"),
F.col("asin_page_inventory").alias("page_inventory"), "asin_bought_month", "seller_json",
F.col("asin_buy_box_seller_type").alias("buy_box_seller_type")
F.col("asin_buy_box_seller_type").alias("buy_box_seller_type"), "asin_describe", "asin_fbm_price"
)
table_columns = """asin, asin_ao_val, asin_title, asin_title_len, asin_category_desc, asin_volume,
asin_weight, asin_launch_time, asin_brand_name, one_star, two_star, three_star, four_star, five_star, low_star,
account_name, account_id, seller_country_name, category_first_id, parent_asin, variation_num, img_info,
asin_crawl_date, asin_price, asin_rating, asin_total_comments, matrix_ao_val, zr_flow_proportion, matrix_flow_proportion,
date_info, img_url, category_current_id, category_first_rank, category_current_rank, asin_type, bsr_orders, bsr_orders_sale,
page_inventory, asin_bought_month, seller_json, buy_box_seller_type"""
page_inventory, asin_bought_month, seller_json, buy_box_seller_type, asin_describe, asin_fbm_price"""
DorisHelper.spark_export_with_columns(df_save=df_doris, db_name=self.doris_db, table_name=self.asin_latest_detail_table, table_columns=table_columns)
print("save asin_latest_detail success")
else:
......
......@@ -95,18 +95,18 @@ def handle_new_store_collections(new_collect_store_id):
except:
pass
print("推送失败")
CommonUtil.send_wx_msg(['wangrui4'], f"\u26A0店铺收藏消息推送失败\u26A0", f"任务信息: {cmd} 请注意检查!")
CommonUtil.send_wx_msg(['chenyuanjie'], f"\u26A0店铺收藏消息推送失败\u26A0", f"任务信息: {cmd} 请注意检查!")
else:
print("dwt执行失败")
print("错误信息为:============")
print(error.decode())
CommonUtil.send_wx_msg(['wangrui4'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
CommonUtil.send_wx_msg(['chenyuanjie'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
else:
print("dws执行失败")
print("错误信息为:============")
print(error.decode())
CommonUtil.send_wx_msg(['wangrui4'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
CommonUtil.send_wx_msg(['chenyuanjie'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
if __name__ == '__main__':
......
......@@ -81,5 +81,82 @@ def handle_calc():
print("success")
def handle_calc_new():
day_end = CommonUtil.format_now("%Y-%m-%d")
CommonUtil.orctable_concatenate(
hive_table="dim_st_pcp_history",
partition_dict={
"date_info": CommonUtil.get_day_offset(day_end, -1)
},
innerFlag=False,
min_part_num=10,
max_retry_time=5
)
spark = SparkUtil.get_spark_session("dwt_st_pcp_current")
day_start = CommonUtil.get_day_offset(day_end, -90)
df_all = spark.sql(f"""
select site_id,
group_id,
keyword_id,
keyword,
match_type,
created_at,
min_bid,
max_bid,
suggested_bid,
date_info
from dim_st_pcp_history
where date_info >= '{day_start}'
and date_info <= '{day_end}'
""")
window = Window.partitionBy(['site_id', 'match_type', 'keyword'])
df_all = df_all.where("site_id is not null and created_at is not null")
# deduplicate
df_all = df_all.dropDuplicates(['site_id', 'match_type', 'keyword', 'date_info'])
# keep only the most recent day per (site_id, match_type, keyword)
df_save = df_all.withColumn("day_row_number",
F.row_number().over(window.orderBy(F.col("date_info").desc())))
df_save = df_save.where("day_row_number == 1")
# within that latest day, keep the row with the smallest suggested_bid
df_save = df_save.withColumn("min_row_number",
F.row_number().over(window.orderBy(F.col("suggested_bid").asc())))
df_save = df_save.where("min_row_number == 1")
df_history = df_all.groupby([F.col("site_id"), F.col("keyword"), F.col("match_type")]).agg(
F.collect_list(F.struct(F.col("min_bid"), F.col("max_bid"), F.col("suggested_bid"), F.col("created_at"))).alias("list")
)
df_history = df_history.withColumn("history_json",
F.when(F.size(F.col("list")) <= 1, F.lit(None)).otherwise(F.to_json(F.col("list"))))
df_save = df_save.join(df_history, on=['site_id', 'keyword', 'match_type'], how='left').select(
df_save['site_id'],
F.col('group_id'),
F.col('keyword_id'),
df_save['keyword'],
df_save['match_type'],
F.col('created_at'),
F.col('min_bid'),
F.col('max_bid'),
F.col('suggested_bid'),
F.col('history_json'),
F.lit("90").alias("day")
)
# upsert the result table
CommonUtil.save_or_update_table(
spark_session=spark,
hive_tb_name="dwt_st_pcp_current_v2",
partition_dict={
"day": "90"
},
df_save=df_save
)
print("success")
if __name__ == '__main__':
handle_calc()
handle_calc_new()
......@@ -42,10 +42,13 @@ class DwtThemeBsOrders(Templates):
# f"union all " \
# f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
# f"date_type='month' and date_info in ('2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06');"
sql = f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
f"date_type='month' and date_info >= '2023-01' and date_info <= '2023-12';"
# sql = f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
# f"date_type='month' and date_info >= '2023-01' and date_info <= '2023-12';"
# f"date_type='month' and date_info >= '2023-01' and date_info <= '2023-01' limit 1000000;"
sql = f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
f"date_type='month' and date_info >= '2024-08' and date_info <= '2025-07';"
print("sql:", sql)
self.df_flow = self.spark.sql(sql).cache()
self.df_flow.show(10, truncate=False)
......@@ -186,7 +189,8 @@ class DwtThemeBsOrders(Templates):
self.df_save = self.df_save.cache()
self.df_save.show(50, truncate=False)
df = self.df_save.toPandas()
df.to_csv("/root/theme_new_2023.csv", index=False)
# df.to_csv("/root/theme_new_2023.csv", index=False)
df.to_csv("/home/fangxingjun/theme_new_202408-202507.csv", index=False)
if __name__ == '__main__':
......
......@@ -90,11 +90,11 @@ class DwtTop100(Templates):
def save_data(self):
self.df_save = self.df_flow_asin
self.df_save = self.df_save.toPandas()
self.df_save.to_csv(f"/root/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
self.df_save.to_csv(f"/home/fangxingjun/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
def save_data_old(self):
self.df_save = self.df_save.toPandas()
self.df_save.to_csv(f"/root/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
self.df_save.to_csv(f"/home/fangxingjun/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
if __name__ == '__main__':
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.DorisHelper import DorisHelper
from pyspark.sql import functions as F
__es_ip__ = "192.168.10.217"
__es_port__ = "9200"
__es_user__ = "elastic"
__es_passwd__ = "Selection20251#+"
class EsUpdate(object):
def __init__(self, site_name, date_info):
self.site_name = site_name
self.date_info = date_info
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
year, month = self.date_info.split('-')
self.index_name = f"{site_name}_st_detail_month_{year}_{month}"
self.es_options = {
"es.nodes": __es_ip__,
"es.port": __es_port__,
"es.net.http.auth.user": __es_user__,
"es.net.http.auth.pass": __es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{self.index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.write.retry.wait": "60s",
"es.batch.size.entries": "5000",
"es.nodes.wan.only": "false",
"es.batch.write.concurrency": "60",
"es.write.operation": "upsert"
}
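# Note: with "es.write.operation": "upsert" and "es.mapping.id": "asin", the elasticsearch-hadoop
# connector updates existing documents in the index keyed by asin instead of creating duplicates,
# which is what the later partial update of asin_bought_month relies on.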
self.df_asin = self.spark.sql(f"select 1+1;")
self.df_es_asin = self.spark.sql(f"select 1+1;")
self.df_need_update = self.spark.sql(f"select 1+1;")
def run(self):
self.get_update_asin()
self.update_es_filed()
def get_update_asin(self):
sql = f"""
select asin from {self.site_name}_asin_detail_2025_buysales_err where date_info = '{self.date_info}'
"""
pg_con_info = DBUtil.get_connection_info("postgresql_14", self.site_name)
self.df_asin = SparkUtil.read_jdbc_query(
session=self.spark,
url=pg_con_info['url'],
username=pg_con_info['username'],
pwd=pg_con_info['pwd'],
query=sql
)
self.df_asin = self.df_asin.dropDuplicates(['asin']).repartition(40, 'asin').cache()
print("爬虫表数据量为:", self.df_asin.count())
def update_es_filed(self):
es_asin_sql = f"""
SELECT asin from es_selection.default_db.{self.index_name}
"""
self.df_es_asin = DorisHelper.spark_import_with_sql(self.spark, es_asin_sql).repartition(40, 'asin')
self.df_need_update = self.df_asin.join(
self.df_es_asin, on=['asin'], how='inner'
).withColumn(
'asin_bought_month', F.lit(0)
).cache()
print("ES待更新的数据量为:", self.df_need_update.count())
print(f"正在更新ES数据,更新索引:{self.index_name}")
try:
self.df_need_update.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
print(f"ES {self.index_name} 索引更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES月销数据更新失败', f'失败索引:{self.index_name}')
if __name__ == "__main__":
site_name = sys.argv[1]
date_info = sys.argv[2]
handle_obj = EsUpdate(site_name, date_info)
handle_obj.run()
......@@ -11,7 +11,7 @@ from utils.DorisHelper import DorisHelper
__es_ip__ = "192.168.10.217"
__es_port__ = "9200"
__es_user__ = "elastic"
__es_passwd__ = "selection2021.+"
__es_passwd__ = "Selection20251#+"
__warehouse_dir__ = "hdfs://nameservice1:8020/home/big_data_selection"
__metastore_uris__ = "thrift://hadoop16:9083"
......
......@@ -17,7 +17,7 @@ from utils.DorisHelper import DorisHelper
__es_ip__ = "192.168.10.217"
__es_port__ = "9200"
__es_user__ = "elastic"
__es_passwd__ = "selection2021.+"
__es_passwd__ = "Selection20251#+"
__warehouse_dir__ = "hdfs://nameservice1:8020/home/big_data_selection"
__metastore_uris__ = "thrift://hadoop16:9083"
......@@ -81,7 +81,7 @@ def update_es_fileds(spark, df_main, date_info_list, site_name, run_type):
print(f"elasticsearch {index_name} 更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['wujicang', 'wangrui4'], '\u26A0 es用户标记信息更新失败', f'es更新用户标记信息失败:{site_name}, {date_info}')
CommonUtil.send_wx_msg(['wujicang', 'chenyuanjie'], '\u26A0 es用户标记信息更新失败', f'es更新用户标记信息失败:{site_name}, {date_info}')
pass
print("elasticsearch 所有数据全部更新完毕")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from utils.es_util import EsUtils
from utils.db_util import DBUtil
from datetime import datetime, timedelta
from pyspark.sql import functions as F
class EsAiAsinAdd(object):
def __init__(self, site_name, date_type, date_info):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
if self.site_name == 'us':
self.pg_tb = "ai_asin_analyze_detail"
else:
self.pg_tb = f"{self.site_name}_ai_asin_analyze_detail"
launch_time_base_date = self.spark.sql(
f"""SELECT max(`date`) AS last_day FROM dim_date_20_to_30 WHERE year_month = '{self.date_info}'"""
).collect()[0]['last_day']
self.launch_time_interval_dict = self.get_launch_time_interval_dict(launch_time_base_date)
self.es_client = EsUtils.get_es_client()
self.es_index = f"{self.site_name}_ai_asin_analyze_detail_{self.date_info.replace('-', '_')}"
self.es_pipeline = f"{self.site_name}_ai_analyze_pipeline"
self.es_options = self.get_es_options(self.es_index, self.es_pipeline)
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_analyze = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
@staticmethod
def get_launch_time_interval_dict(base_date):
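        # Cut-off dates counted backwards from the last day of the month, used to bucket launch_time into age ranges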
base_date = datetime.strptime(base_date, '%Y-%m-%d')
return {
"one_month": (base_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (base_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (base_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (base_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (base_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (base_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
}
@staticmethod
def get_es_options(index_name, pipeline_id):
return {
"es.nodes": EsUtils.__es_ip__,
"es.port": EsUtils.__es_port__,
"es.net.http.auth.user": EsUtils.__es_user__,
"es.net.http.auth.pass": EsUtils.__es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.write.retry.wait": "60s",
"es.batch.size.entries": "5000",
"es.nodes.wan.only": "false",
"es.batch.write.concurrency": "40",
"es.write.operation": "index",
"es.ingest.pipeline": f"{pipeline_id}"
}
def run(self):
self.read_data()
self.handle_data()
self.save_data()
def read_data(self):
        # Read the monthly ASIN detail data
sql1 = f"""
select
site_name,
asin,
weight,
bought_month,
category,
img,
title,
brand,
account_name,
account_addr,
buy_box_seller_type,
launch_time,
img_num,
variation_flag,
variation_num,
ao_val,
category_id,
category_current_id,
parent_asin,
bsr_rank,
price,
rating,
total_comments,
seller_id,
fb_country_name,
bought_month_mom,
bought_month_yoy,
is_new_flag,
is_ascending_flag
from dwt_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_ai_asin_detail = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin').cache()
print("ASIN信息库数据如下:")
self.df_ai_asin_detail.show(10, True)
        # Read the AI analysis results
sql2 = f"""
select
asin,
id as analyze_id,
package_quantity,
material,
color,
appearance,
size,
function,
shape,
scene_title,
scene_comment,
uses,
theme,
crowd,
short_desc,
title_pic_flag,
title_word_flag,
title_pic_content,
title_word_content,
array_to_string(package_quantity_arr, ',') as package_quantity_arr,
package_quantity_flag,
label_content
from {self.pg_tb}
"""
conn_info = DBUtil.get_connection_info("postgresql", "us")
self.df_ai_asin_analyze = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=sql2
).withColumn(
'package_quantity_arr', F.split(F.col('package_quantity_arr'), ',')
).withColumn(
'package_quantity_arr', F.expr('transform(package_quantity_arr, x -> cast(x as int))')
).repartition(40, 'asin').cache()
print("AI分析数据如下:")
self.df_ai_asin_analyze.show(10, True)
def handle_data(self):
        # Derive the launch_time_type field
one_month = self.launch_time_interval_dict['one_month']
three_month = self.launch_time_interval_dict['three_month']
six_month = self.launch_time_interval_dict['six_month']
twelve_month = self.launch_time_interval_dict['twelve_month']
twenty_four_month = self.launch_time_interval_dict['twenty_four_month']
thirty_six_month = self.launch_time_interval_dict['thirty_six_month']
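        # Bucket launch_time into codes 1-7 (newest to oldest); 0 when launch_time is null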
expr_str = f"""
CASE WHEN launch_time >= '{one_month}' THEN 1
WHEN launch_time >= '{three_month}' AND launch_time < '{one_month}' THEN 2
WHEN launch_time >= '{six_month}' AND launch_time < '{three_month}' THEN 3
WHEN launch_time >= '{twelve_month}' AND launch_time < '{six_month}' THEN 4
WHEN launch_time >= '{twenty_four_month}' AND launch_time < '{twelve_month}' THEN 5
WHEN launch_time >= '{thirty_six_month}' AND launch_time < '{twenty_four_month}' THEN 6
WHEN launch_time < '{thirty_six_month}' THEN 7
ELSE 0 END
"""
self.df_ai_asin_detail = self.df_ai_asin_detail.withColumn('launch_time_type', F.expr(expr_str))
def save_data(self):
self.df_save = self.df_ai_asin_detail.join(
self.df_ai_asin_analyze, 'asin', 'inner'
).select(
'account_addr',
'account_name',
'analyze_id',
'ao_val',
'appearance',
'asin',
'bought_month',
'bought_month_mom',
'bought_month_yoy',
'brand',
'bsr_rank',
'buy_box_seller_type',
'category',
'category_current_id',
'category_id',
'color',
'crowd',
'fb_country_name',
'function',
'img',
'img_num',
'is_ascending_flag',
'is_new_flag',
'label_content',
'launch_time',
'launch_time_type',
'material',
'package_quantity',
'package_quantity_arr',
'package_quantity_flag',
'parent_asin',
'price',
'rating',
'scene_comment',
'scene_title',
'seller_id',
'shape',
'short_desc',
'site_name',
'size',
'theme',
'title',
'title_pic_content',
'title_pic_flag',
'title_word_content',
'title_word_flag',
'total_comments',
'uses',
'variation_flag',
'variation_num',
'weight'
).cache()
try:
self.df_save.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
print(f"ES {self.es_index} 索引更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_index}')
if __name__ == "__main__":
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
print("开始执行时间:", datetime.now().strftime("%Y-%m-%d %H:%M"))
handle_obj = EsAiAsinAdd(site_name, date_type, date_info)
handle_obj.run()
print("执行结束时间:", datetime.now().strftime("%Y-%m-%d %H:%M"))
print("success!!!")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.es_util import EsUtils
from pyspark.sql import functions as F
from utils.common_util import CommonUtil
class EsAiAsinAll(object):
def __init__(self, site_name):
self.site_name = site_name
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
        # Elasticsearch-related configuration
self.es_client = EsUtils.get_es_client()
self.es_index = f"{site_name}_ai_analyze_extra"
self.es_body = self.get_es_body()
self.es_options = self.get_es_options(self.es_index)
self.policy_name = f"{site_name}_ai_analyze_policy"
self.pipeline_id = f"{site_name}_ai_analyze_pipeline"
self.df_asin_detail = self.spark.sql(f"select 1+1;")
@staticmethod
def get_es_body():
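        # Settings and mapping for the enrich source index: keyword asin plus the trend flags and the max bought-month array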
return {
"settings": {
"number_of_shards": "3",
"number_of_replicas": "1"
},
"mappings": {
"properties": {
"asin": {
"type": "keyword"
},
"is_stable_flag": {
"type": "short"
},
"is_periodic_flag": {
"type": "short"
},
"is_ascending_flag": {
"type": "short"
},
"max_bought_month_arr": {
"type": "integer"
}
}
}
}
@staticmethod
def get_es_options(index_name):
return {
"es.nodes": EsUtils.__es_ip__,
"es.port": EsUtils.__es_port__,
"es.net.http.auth.user": EsUtils.__es_user__,
"es.net.http.auth.pass": EsUtils.__es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.write.retry.wait": "60s",
"es.batch.size.entries": "5000",
"es.nodes.wan.only": "false",
"es.batch.write.concurrency": "40",
"es.write.operation": "index"
}
def run(self):
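        # Write the per-ASIN trend flags to the enrich source index, then refresh the enrich policy and (re)create the ingest pipeline that attaches them by asin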
self.read_data()
self.es_save()
self.create_enrich_policy()
self.create_enrich_pipeline()
def read_data(self):
sql = f"""
select
asin,
is_stable_flag,
is_periodic_flag,
is_ascending_flag,
max_month_last_12_month as max_bought_month_arr
from dwt_ai_asin_all
where site_name = '{self.site_name}'
"""
self.df_asin_detail = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').withColumn(
"max_bought_month_arr", F.split(F.col("max_bought_month_arr"), ",")
).withColumn(
"max_bought_month_arr", F.expr("transform(max_bought_month_arr, x -> cast(x as int))")
).cache()
print("ASIN信息库数据如下:")
self.df_asin_detail.show(10, True)
def es_save(self):
print(f"创建富集索引:{self.es_index}!")
EsUtils.create_index(self.es_index, self.es_client, self.es_body)
try:
self.df_asin_detail.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
print(f"ES {self.es_index} 索引更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_index}')
def create_enrich_policy(self):
# print(f"创建富集策略:{self.policy_name}!")
# policy_body = {
# "match": {
# "indices": f"{self.es_index}",
# "match_field": "asin",
# "enrich_fields": ["is_stable_flag", "is_periodic_flag", "is_ascending_flag", "max_bought_month_arr"]
# }
# }
# self.es_client.enrich.put_policy(name=self.policy_name, body=policy_body)
print(f"刷新富集策略:{self.policy_name}!")
self.es_client.enrich.execute_policy(self.policy_name, request_timeout=1800)
def create_enrich_pipeline(self):
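        # Ingest pipeline with a single enrich processor that looks up the asin and writes the match into last_year_extra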
print(f"创建富集管道:{self.pipeline_id}!")
pipeline_body = {
"description": "ai asin analyze pipeline",
"processors": [
{
"enrich": {
"policy_name": self.policy_name,
"field": "asin",
"target_field": "last_year_extra",
"max_matches": 1,
"ignore_missing": True
},
}
]
}
self.es_client.ingest.put_pipeline(id=self.pipeline_id, body=pipeline_body)
pass
if __name__ == "__main__":
site_name = sys.argv[1]
handle_obj = EsAiAsinAll(site_name)
handle_obj.run()
print("success!!!")
@@ -48,13 +48,17 @@ class EsStDetail(TemplatesMysql):
self.record_table_name_field = f'{self.site_name}_flow_asin_last_month' if self.date_type == 'month' else f'{self.site_name}_flow_asin_last30day'
        # elasticsearch-related configuration
self.client = EsUtils.get_es_client()
self.es_options = EsUtils.get_es_options(self.es_index_name)
        # Enrich-policy configuration, used to update the usr_mask_type field
self.policy_name1 = "user_mask_asin_policy"
self.policy_name2 = "user_mask_category_policy"
self.pipeline_id = "user_asin_mask_enrich_pipeline"
self.es_options = EsUtils.get_es_options(self.es_index_name, self.pipeline_id)
self.es_body = EsUtils.get_es_body()
        # Formal exports must be recorded in the export record table
if result_type == 'formal':
CommonUtil.judge_is_work_hours(site_name=site_name, date_type=date_type, date_info=date_info,
principal='wangrui4', priority=3, export_tools_type=2,
principal='chenyuanjie', priority=3, export_tools_type=2,
belonging_to_process='流量选品')
def get_date_from_week(self):
@@ -93,7 +97,7 @@ class EsStDetail(TemplatesMysql):
current_category_rank, asin_weight_ratio, asin_bought_month, asin_lqs_rating, asin_lqs_rating_detail,
title_matching_degree, asin_lob_info, is_contains_lob_info, is_package_quantity_abnormal, zr_flow_proportion,
matrix_flow_proportion, matrix_ao_val, customer_reviews_json as product_features, img_info,
coalesce(parent_asin, asin) as collapse_asin, follow_sellers_count
coalesce(parent_asin, asin) as collapse_asin, follow_sellers_count, asin_describe, asin_fbm_price as fbm_price
from {self.table_name} where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'
"""
print("sql:", sql)
@@ -105,6 +109,10 @@
def es_prepare(self):
print("当前链接的es节点信息为:" + str(EsUtils.__es_ip__))
EsUtils.create_index(self.es_index_name, self.client, self.es_body)
        # Execute the enrich policies
self.client.enrich.execute_policy(name=self.policy_name1)
self.client.enrich.execute_policy(name=self.policy_name2)
# EsUtils.user_enrich_pipeline(self.client, self.pipeline_id, self.policy_name1, self.policy_name2)
if self.date_type != 'month':
if not EsUtils.exist_index_alias(self.alias_name, self.client):
EsUtils.create_index_alias(self.es_index_name, self.alias_name, self.client)
@@ -97,7 +97,9 @@ def save_to_doris(df_all: DataFrame):
"other_seller_name",
"buy_sales",
"updated_at",
"img_num"
"img_num",
"online_time",
"is_high_return_rate"
)
write_fields = ",".join(df_all.schema.fieldNames())
@@ -123,10 +125,13 @@ def save_to_doris(df_all: DataFrame):
def export():
spark = SparkUtil.get_spark_session("self_asin_redis:export")
day = CommonUtil.get_sys_arg(1, CommonUtil.format_now("%Y-%m-%d"))
export_type = CommonUtil.get_sys_arg(2, "redis&&doris")
last_day = CommonUtil.get_day_offset(day, -1)
next_day = CommonUtil.get_day_offset(day, 1)
    # Delete the existing key first
redis_key = f"self_asin_detail:{day}"
if "redis" in export_type:
client = RedisUtils.get_redis_client_by_type(db_type='microservice')
if client.exists(redis_key):
client.delete(redis_key)
@@ -161,7 +166,9 @@ def export():
other_seller_name,
buy_sales,
img_num,
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at,
        date_format(online_time, '%Y-%m-%d %H:%i:%S') online_time,
returns
from {site_name}_self_asin_detail
where updated_at >= '{last_day}'
and updated_at <= '{next_day}'
@@ -175,13 +182,27 @@ def export():
.where("row_number == 1") \
.drop("row_number")
asin_df = asin_df.withColumn("is_high_return_rate", F.expr("""
case
when returns = 'Customers usually keep this item' then 1
when returns = 'Frequently returned item' then 2
else 0
end
""")).drop("returns")
    # Fill in default values
asin_df = na_fill(asin_df).cache()
if "redis" in export_type:
asin_df.toJSON().foreachPartition(functools.partial(save_to_redis_list, batch=5000, redis_key=redis_key, ttl=3600 * 24))
print(f"{site_name}:redis:success")
if "doris" in export_type:
save_to_doris(asin_df)
print(f"{site_name}:doris:success")
print("success all")
if "redis" in export_type:
check_total()
pass
@@ -212,7 +233,9 @@ def na_fill(asin_df):
"account_name": "",
"other_seller_name": "",
"buy_sales": "",
"img_num": 0
"img_num": 0,
"online_time": "",
"is_high_return_rate": 0
})
pass
@@ -284,7 +307,9 @@ def export_all():
other_seller_name,
buy_sales,
img_num,
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at,
        date_format(online_time, '%Y-%m-%d %H:%i:%S') online_time,
returns
from (
select max(id) as max_id
from {site_name}_self_asin_detail
@@ -293,6 +318,13 @@
inner join {site_name}_self_asin_detail tmp2 on tmp1.max_id = tmp2.id
"""
asin_df = SparkUtil.read_jdbc(spark, DbTypes.mysql.name, site_name, query=query)
asin_df = asin_df.withColumn("is_high_return_rate", F.expr("""
case
when returns = 'Customers usually keep this item' then 1
when returns = 'Frequently returned item' then 2
else 0
end
""")).drop("returns")
    # Fill in default values
asin_df = na_fill(asin_df)
asin_df.toJSON().foreachPartition(functools.partial(save_to_redis_list, batch=1000, redis_key=redis_key, ttl=3600 * 24 * 7))
@@ -157,6 +157,6 @@ if __name__ == '__main__':
# print(schema_flag)
if not schema_flag:
msg = f"数据表:{hive_table} {msg_params},计算数据存在验证不通过,请检查数据是否异常!!具体信息请查看日志!!"
CommonUtil.send_wx_msg(['chenjianyun', 'wangrui4'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
CommonUtil.send_wx_msg(['chenjianyun'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
spark_session.stop()
pass
\ No newline at end of file
"""
Execution order for image-to-image search:
# 1. Update images and download them locally (run on h7 and h5)
/mnt/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_download.py us amazon_inv 200 1
# 2. Register newly added images - defaults to the last 7 days
/mnt/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_local_path.py us amazon_inv
# 3. Extract image features -- h5/h6/h7 -- can run on several machines at once (currently on h5)
/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_extract_features.py us amazon_inv 1000 5
# 4. Import the image feature data into ods
/mnt/run_shell/sqoop_shell/import/img_features.sh us amazon_inv
# 5. Slice into dim
/mnt/run_shell/spark_shell/dim/img_dim_features_slice.sh us amazon_inv
# 6.1 Build the index mapping -- doris img_hdfs_index, load the copy table first
/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_hdfs_index.py us amazon_inv
# 6.2 Build the index mapping -- hive img_dwd_id_index
/opt/module/spark/bin/spark-submit --master yarn --driver-memory 2g --executor-memory 4g --executor-cores 1 --num-executors 1 --queue spark /opt/module/spark/demo/py_demo/img_search/img_dwd_id_index_multiprocess.py us amazon_inv 3
# 7. Export the id/index mapping to doris (copy table)
/opt/module/spark/bin/spark-submit --master yarn --driver-memory 20g --executor-memory 20g --executor-cores 4 --num-executors 2 --queue spark /opt/module/spark/demo/py_demo/img_search/img_id_index_to_doris.py us amazon_inv
# 8. Delete the index-related files on HDFS
hdfs dfs -rm -r /home/img_search/img_parquet/${site_name}/${img_type}/* 2>/dev/null || true
hdfs dfs -rm -r /home/img_search/img_tmp/${site_name}/${img_type}/* 2>/dev/null || true
hdfs dfs -rm -r /home/img_search/img_index/${site_name}/${img_type}/* 2>/dev/null || true
# 9. Upload the parquet files to HDFS
hdfs dfs -put /mnt/data/img_data/img_parquet/${site_name}/${img_type}/*/*.parquet /home/img_search/img_parquet/${site_name}/${img_type}/
# 10. Build the index
/mnt/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark/demo/py_demo/img_search/img_create_index.py
# 11. Pull the index file from HDFS to local disk
rm -rf /mnt/data/img_data/img_index/${site_name}/${img_type}/* 2>/dev/null || true
hdfs dfs -get /home/img_search/img_index/${site_name}/${img_type}/knn.index /mnt/data/img_data/img_index/${site_name}/${img_type}/
# 12. Restart the search API service
ssh hadoop7 systemctl restart img_search.service
# 13. Swap the table names
/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark/demo/py_demo/img_search/img_alter_table_name.py ${site_name} ${img_type}
"""
\ No newline at end of file
import os
import sys
import time
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
from utils.db_util import DbTypes, DBUtil
from datetime import date
class ImgAlterTableName(Templates):
def __init__(self, site_name='us', img_type="amazon_inv"):
super(ImgAlterTableName, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.spark = self.create_spark_object(app_name=f"{self.db_save}: {self.site_name}")
def read_data(self):
pass
def handle_data(self):
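        # Swap img_id_index and img_id_index_copy via a three-way rename, then verify the swapped-in table was created today; otherwise wait 10 s and retry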
while True:
with self.engine_doris.begin() as conn:
sql1 = "ALTER TABLE img_id_index RENAME img_id_index_temp;"
conn.execute(sql1)
sql2 = "ALTER TABLE img_id_index_copy RENAME img_id_index;"
conn.execute(sql2)
sql3 = "ALTER TABLE img_id_index_temp RENAME img_id_index_copy;"
conn.execute(sql3)
print(f"交换表名称完成--sql1: {sql1}\nsql2: {sql2}\nsql3: {sql3}")
sql_read = "select * from img_id_index limit 1"
df_read = pd.read_sql(sql_read, con=self.engine_doris)
created_time = str(list(df_read.created_time)[0])
created_date = created_time[:10] if len(created_time) > 10 else None
                # Get today's date
today_str = date.today().strftime('%Y-%m-%d') # '2025-08-20'
if created_date == today_str:
print(f"日期校验通过: created_date--{created_date}, today_str--{today_str}")
break
else:
print(f"日期校验不通过,等待10s继续: created_date--{created_date}, today_str--{today_str}")
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
time.sleep(10)
continue
def save_data(self):
pass
if __name__ == '__main__':
site_name = sys.argv[1]
img_type = sys.argv[2]
handle_obj = ImgAlterTableName(site_name=site_name, img_type=img_type)
handle_obj.run()
\ No newline at end of file
import os
from autofaiss import build_index
from pyspark.sql import SparkSession # pylint: disable=import-outside-toplevel
from pyspark import SparkConf, SparkContext
def create_spark_session():
# this must be a path that is available on all worker nodes
# os.environ['PYSPARK_PYTHON'] = "/opt/module/spark/demo/py_demo/img_search/autofaiss.pex"
spark = (
SparkSession.builder
.config("spark.executorEnv.PEX_ROOT", "./.pex")
.config("spark.executor.cores", "4")
.config("spark.executor.memory", "20G") # make sure to increase this if you're using more cores per executor
.config("spark.num.executors", "10")
.config("spark.yarn.queue", "spark")
.master("local") # this should point to your master node, if using the tunnelling version, keep this to localhost
.appName("autofaiss-create-index")
.getOrCreate()
)
return spark
spark = create_spark_session()
index, index_infos = build_index(
# embeddings="hdfs://nameservice1:8020/home/img_search/us/amazon_inv/parquet",
embeddings="hdfs://nameservice1:8020/home/img_search/img_parquet/us/amazon_inv",
distributed="pyspark",
file_format="parquet",
max_index_memory_usage="80G", # 16G
current_memory_available="120G", # 24G
temporary_indices_folder="hdfs://nameservice1:8020/home/img_search/img_tmp/us/amazon_inv//distributed_autofaiss_indices",
index_path="hdfs://nameservice1:8020/home/img_search/img_index/us/amazon_inv/knn.index",
index_infos_path="hdfs://nameservice1:8020/home/img_search/img_index/us/amazon_inv/infos.json",
)
print("index, index_infos:", index, index_infos)
import ast
import os
import sys
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType
class PicturesDimFeaturesSlice(Templates):
def __init__(self, site_name='us', img_type='amazon_inv'):
super(PicturesDimFeaturesSlice, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.db_save = f'img_dim_features_slice'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}")
self.df_asin_features = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
# self.partitions_by = ['site_name', 'block']
self.partitions_by = ['site_name', 'img_type']
self.partitions_num = 10
def read_data(self):
# sql = f"select id, asin, img_vector as embedding from ods_asin_extract_features;"
sql = f"select id, img_unique, features, img_type from img_ods_features where site_name='{self.site_name}' and img_type='{self.img_type}';"
print("sql:", sql)
self.df_save = self.spark.sql(sql).cache()
self.df_save.show(10)
print(f"self.df_save.count(): {self.df_save.count()}")
        # The array type does not need to be built at this step
# partitions_num = self.df_asin_features.rdd.getNumPartitions()
# print("分区数量:", partitions_num) # 642
# # self.partitions_num = 1000
# self.df_save = self.df_save.repartition(self.partitions_num)
# print("重置分区数量:", self.partitions_num) # 642
def handle_data(self):
        # UDF that converts the string column into a list of floats
        # str_to_list_udf = F.udf(lambda s: ast.literal_eval(s), ArrayType(FloatType()))
        # # Apply the UDF to the DataFrame column
        # self.df_save = self.df_save.withColumn("embedding", str_to_list_udf(self.df_save["embedding"]))
self.df_save = self.df_save.withColumn('site_name', F.lit(self.site_name))
if __name__ == '__main__':
    site_name = sys.argv[1]  # arg 1: site
    # arg 2: image source type; falls back to the previous default when not supplied
    img_type = sys.argv[2] if len(sys.argv) > 2 else 'amazon_inv'
    handle_obj = PicturesDimFeaturesSlice(site_name=site_name, img_type=img_type)
    handle_obj.run()
import os
import socket
import sys
import threading
import logging
import time
import traceback
import uuid
import pandas as pd
import redis
import requests
from sqlalchemy import text
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s', level=logging.INFO)
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.db_util import DbTypes, DBUtil, get_redis_h14
class ImgDownload(object):
def __init__(self, site_name='us', img_type="amazon_inv", thread_num=10, limit=200):
self.site_name = site_name
self.img_type = img_type
self.thread_num = thread_num
self.limit = limit
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
self.hostname = socket.gethostname()
self.first_local_dir, self.read_table = self.get_first_local_dir()
# self.read_table = f"{self.site_name}_inv_img_info"
self.local_name = self.read_table
def get_first_local_dir(self):
if self.img_type == 'amazon_self':
first_local_dir = f"/mnt/data/img_data/amazon_self/{self.site_name}"
image_table = f'{self.site_name}_self_asin_image'
elif self.img_type == 'amazon':
first_local_dir = f"/mnt/data/img_data/amazon/{self.site_name}"
image_table = f'{self.site_name}_amazon_image'
elif self.img_type == 'amazon_inv':
first_local_dir = f"/mnt/data/img_data/amazon_inv/{self.site_name}"
image_table = f'{self.site_name}_inv_img_info'
else:
first_local_dir = ""
image_table = ""
return first_local_dir, image_table
def acquire_lock(self, lock_name, timeout=100):
"""
        Try to acquire the distributed lock; returns a truthy value when the lock is set, otherwise None
        lock_name: key of the lock, recommended to keep it consistent with the task name
"""
lock_value = str(uuid.uuid4())
        lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True, ex=timeout)  # the expiry is optional
# lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True)
return lock_acquired, lock_value
def release_lock(self, lock_name, lock_value):
"""释放分布式锁"""
script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
"""
result = self.client_redis.eval(script, 1, lock_name, lock_value)
return result
@staticmethod
def img_download(img_url, img_path, img_name):
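        # Retry the download up to 5 times; create the directory when the path is missing, sleep briefly and try again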
file_path = f"{img_path}{img_name}"
for d_num in range(5):
try:
response = requests.get(img_url)
if response.status_code == 200:
# Open a file in binary write mode
with open(file_path, 'wb') as file:
file.write(response.content)
# print("Image downloaded successfully.")
return True
else:
continue
except Exception as e:
error = "No such file or directory"
if error in str(e):
os.makedirs(img_path)
print(f"{d_num}次--下载图片失败, 图片路径: {file_path}, 图片url: {img_url}, \n错误信息: {e, traceback.format_exc()}")
time.sleep(2)
return False
def update_state(self, id_list, state, state_value="success"):
if id_list:
while True:
try:
with self.engine_mysql.begin() as conn:
id_tuple = tuple(id_list)
print(f"{state_value}--id_tuple: {len(id_tuple)}, {id_tuple[:10]}", )
if id_tuple:
id_tuple_str = f"('{id_tuple[0]}')" if len(id_tuple) == 1 else f"{id_tuple}"
sql_update = f"UPDATE {self.read_table} SET state={state} WHERE id IN {id_tuple_str};"
print("sql_update:", sql_update[:150])
conn.execute(sql_update)
break
except Exception as e:
print(f"读取数据错误: {e}", traceback.format_exc())
time.sleep(20)
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
continue
def read_data(self):
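        # Claim a batch under a Redis lock: read rows with state=1, mark them state=2 so other workers skip them, release the lock, return the batch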
while True:
try:
lock_acquired, lock_value = self.acquire_lock(lock_name=self.local_name)
if lock_acquired:
print("self.hostname:", self.hostname)
with self.engine_mysql.begin() as conn:
sql_read = text(f"SELECT id, img_id, img_type, img_url, id_segment FROM {self.read_table} WHERE state=1 LIMIT {self.limit};")
df = pd.read_sql(sql=sql_read, con=self.engine_mysql)
id_tuple = tuple(df.id)
print(f"sql_read: {sql_read}, {df.shape}", id_tuple[:10])
if id_tuple:
id_tuple_str = f"('{id_tuple[0]}')" if len(id_tuple) == 1 else f"{id_tuple}"
sql_update = f"UPDATE {self.read_table} SET state=2 WHERE id IN {id_tuple_str};"
print("sql_update:", sql_update[:150])
conn.execute(sql_update)
self.release_lock(lock_name=self.local_name, lock_value=lock_value)
return df
else:
print(f"当前有其它进程占用redis的锁, 等待5秒继续获取数据")
time.sleep(10) # 等待5s继续访问锁
continue
except Exception as e:
print(f"读取数据错误: {e}", traceback.format_exc())
time.sleep(20)
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
continue
def handle_data(self, df, thread_id):
        # 1. Download the images
img_success_id_list = []
img_failed_id_list = []
id_list = list(df.id)
id_len = len(id_list)
for id_segment, id, img_id, img_type, img_url in zip(df.id_segment, df.id, df.img_id, df.img_type, df.img_url):
img_path = f"{self.first_local_dir}/{id_segment}/"
img_name = f"{id_segment}_{id}_{img_id}_{img_type}.jpg"
if self.hostname not in ['hadoop5', 'hadoop6', 'hadoop7', 'hadoop8']:
img_path = img_path.replace("/mnt", "/home")
d_flag = self.img_download(img_url=img_url, img_path=img_path, img_name=img_name)
id_index = id_list.index(id)
print(f"self.hostname: {self.hostname}, 线程: {thread_id}, 是否成功: {d_flag}, id_index: {id_index}, 进度: {round(id_index/id_len * 100, 2)}%, img_path: {img_path}{img_name}")
if d_flag:
img_success_id_list.append(id)
else:
img_failed_id_list.append(id)
        # 2. Update the state -- 3 for success, 4 for failure
print(f"success: {len(img_success_id_list)}, failed: {len(img_failed_id_list)}")
self.update_state(id_list=img_success_id_list, state=3, state_value="success")
self.update_state(id_list=img_failed_id_list, state=4, state_value="failed")
def save_data(self):
pass
def run(self, thread_id=1):
while True:
try:
df = self.read_data()
if df.shape[0]:
self.handle_data(df=df, thread_id=thread_id)
self.save_data()
# break
else:
break
except Exception as e:
print(e, traceback.format_exc())
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
time.sleep(20)
continue
def run_thread(self):
logging.info("所有线程处理开始")
thread_list = []
for thread_id in range(self.thread_num):
thread = threading.Thread(target=self.run, args=(thread_id, ))
thread_list.append(thread)
thread.start()
for thread in thread_list:
thread.join()
logging.info("所有线程处理完成")
if __name__ == '__main__':
# handle_obj = PicturesFeatures(self_flag='_self')
# site_name = int(sys.argv[1]) # 参数1:站点
# site_name = 'us'
# img_type = "amazon_inv"
# limit = 100
# thread_num = 1
    site_name = sys.argv[1]  # arg 1: site
    img_type = sys.argv[2]  # arg 2: image source type
    limit = int(sys.argv[3])  # arg 3: rows to read per batch -- 1000
    thread_num = int(sys.argv[4])  # arg 4: number of threads -- 5
handle_obj = ImgDownload(site_name=site_name, img_type=img_type, thread_num=thread_num, limit=limit)
# handle_obj.run()
handle_obj.run_thread()
\ No newline at end of file
import multiprocessing
import os
import sys
import time
import traceback
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
from utils.db_util import DbTypes, DBUtil
class JudgeFinished(Templates):
def __init__(self, site_name='us', img_type="amazon_inv"):
super(JudgeFinished, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.tn_pics_hdfs_index = f"img_hdfs_index"
def judge(self):
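        # Rows still in state 1 or 2 mean there are unprocessed blocks left for this site and img_type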
sql = f"select * from {self.tn_pics_hdfs_index} where state in (1, 2) and site_name='{self.site_name}' and img_type='{self.img_type}';"
df = pd.read_sql(sql, con=self.engine_doris)
print(f"sql: {sql}, {df.shape}")
result_flag = True if df.shape[0] else False
return result_flag
def main(site_name='us', img_type='amazon_inv', p_num=0):
while True:
try:
judge_obj = JudgeFinished(site_name=site_name, img_type=img_type)
result_flag = judge_obj.judge()
if result_flag:
print(f"继续, result_flag: {result_flag}")
os.system(f"/opt/module/spark/bin/spark-submit --master yarn --driver-memory 5g --executor-memory 10g --executor-cores 2 --num-executors 1 --queue spark /opt/module/spark/demo/py_demo/img_search/img_dwd_id_index.py {site_name} {img_type}")
else:
print(f"结束, result_flag: {result_flag}")
break
except Exception as e:
print(e, traceback.format_exc())
time.sleep(20)
error = "ValueError: Length mismatch: Expected axis has 0 elements"
            if error in str(e):
print(f"当前已经跑完所有block块id对应的index关系,退出进程-{p_num}")
quit()
continue
if __name__ == "__main__":
site_name = sys.argv[1]
img_type = sys.argv[2]
    process_num = int(sys.argv[3])  # arg 3: number of processes
processes = []
    for p_num in range(process_num):  # spawn the configured number of processes
process = multiprocessing.Process(target=main, args=(site_name, img_type, p_num))
process.start()
processes.append(process)
    # Wait for all processes to finish
for process in processes:
process.join()
import os
import sys
import threading
import time
import traceback
import socket
import uuid
import numpy as np
import pandas as pd
import redis
import logging
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
# from utils.templates import Templates
from sqlalchemy import text
from vgg_model import VGGNet
from utils.db_util import DbTypes, DBUtil, get_redis_h14
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s', level=logging.INFO)
class ImgExtractFeatures(object):
def __init__(self, site_name='us', img_type="amazon_inv", thread_num=10, limit=1000):
# super(ImgFeatures, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.thread_num = thread_num
self.limit = limit
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
self.local_name = f"{self.site_name}_img_features"
self.vgg_model = VGGNet()
self.hostname = socket.gethostname()
self.read_table = f"img_local_path"
self.save_table = f"img_features"
def acquire_lock(self, lock_name, timeout=100):
"""
        Try to acquire the distributed lock; returns a truthy value when the lock is set, otherwise None
        lock_name: key of the lock, recommended to keep it consistent with the task name
"""
lock_value = str(uuid.uuid4())
        lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True, ex=timeout)  # the expiry is optional
# lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True)
return lock_acquired, lock_value
def release_lock(self, lock_name, lock_value):
"""释放分布式锁"""
script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
"""
result = self.client_redis.eval(script, 1, lock_name, lock_value)
return result
def read_data(self):
while True:
try:
lock_acquired, lock_value = self.acquire_lock(lock_name=self.local_name)
if lock_acquired:
print("self.hostname:", self.hostname)
with self.engine_doris.begin() as conn:
sql_read = text(f"SELECT id, img_unique, local_path, img_type FROM selection.{self.read_table} WHERE site_name='{self.site_name}' and img_type='{self.img_type}' and state=1 LIMIT {self.limit};")
# result = conn.execute(sql_read)
# df = pd.DataFrame(result.fetchall())
df = pd.read_sql(sql=sql_read, con=self.engine_doris)
img_unique_tuple = tuple(df.img_unique)
print(f"sql_read: {sql_read}, {df.shape}", img_unique_tuple[:10])
if img_unique_tuple:
img_unique_tuple_str = f"('{img_unique_tuple[0]}')" if len(img_unique_tuple) == 1 else f"{img_unique_tuple}"
sql_update = text(f"UPDATE selection.{self.read_table} SET state=2 WHERE img_unique IN {img_unique_tuple_str};")
print("sql_update:", sql_update)
conn.execute(sql_update)
self.release_lock(lock_name=self.local_name, lock_value=lock_value)
return df
else:
print(f"当前有其它进程占用redis的锁, 等待5秒继续获取数据")
                    time.sleep(5)  # wait 5 s, then try to acquire the lock again
continue
except Exception as e:
print(f"读取数据错误: {e}", traceback.format_exc())
time.sleep(5)
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
continue
def handle_data(self, df, thread_id):
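        # Extract a VGG feature vector per image, falling back to a 512-dim zero vector when extraction fails, and collect rows for Doris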
id_list = list(df.id)
img_unique_list = list(df.img_unique)
local_path_list = list(df.local_path)
data_list = []
for id, img_unique, local_path in zip(id_list, img_unique_list, local_path_list):
index = id_list.index(id)
print(f"thread_id, index, id, img_unique, local_path: {thread_id, index, id, img_unique, local_path}")
if self.hostname not in ['hadoop5', 'hadoop6', 'hadoop7', 'hadoop8']:
local_path = local_path.replace("/mnt", "/home")
try:
features = self.vgg_model.vgg_extract_feat(file=local_path)
except Exception as e:
print(e, traceback.format_exc())
features = list(np.zeros(shape=(512,)))
data_list.append([id, img_unique, str(features), self.img_type, self.site_name])
columns = ['id', 'img_unique', 'features', 'img_type', 'site_name']
df_save = pd.DataFrame(data_list, columns=columns)
return df_save
def save_data(self, df):
df.to_sql(self.save_table, con=self.engine_doris, if_exists="append", index=False)
with self.engine_doris.begin() as conn:
img_unique_tuple = tuple(df.img_unique)
if img_unique_tuple:
img_unique_tuple_str = f"('{img_unique_tuple[0]}')" if len(img_unique_tuple) == 1 else f"{img_unique_tuple}"
sql_update = f"update selection.{self.read_table} set state=3 where img_unique in {img_unique_tuple_str};"
print(f"sql_update: {sql_update}")
conn.execute(sql_update)
def run(self, thread_id=1):
while True:
try:
df = self.read_data()
if df.shape[0]:
df_save = self.handle_data(df=df, thread_id=thread_id)
self.save_data(df=df_save)
# break
else:
break
except Exception as e:
print(e, traceback.format_exc())
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
self.vgg_model = VGGNet()
time.sleep(20)
continue
def run_thread(self):
thread_list = []
for thread_id in range(self.thread_num):
thread = threading.Thread(target=self.run, args=(thread_id, ))
thread_list.append(thread)
thread.start()
for thread in thread_list:
thread.join()
logging.info("所有线程处理完成")
if __name__ == '__main__':
# handle_obj = PicturesFeatures(self_flag='_self')
# site_name = int(sys.argv[1]) # 参数1:站点
# site_name = 'us'
# img_type = "amazon_inv"
# limit = 100
# thread_num = 1
    site_name = sys.argv[1]  # arg 1: site
    img_type = sys.argv[2]  # arg 2: image source type
    limit = int(sys.argv[3])  # arg 3: rows to read per batch -- 1000
    thread_num = int(sys.argv[4])  # arg 4: number of threads -- 5
handle_obj = ImgExtractFeatures(site_name=site_name, img_type=img_type, thread_num=thread_num, limit=limit)
# handle_obj.run()
handle_obj.run_thread()
\ No newline at end of file