Commit 8d26273e by chenyuanjie

利润率每日更新

parent 72393bc8
...@@ -7,29 +7,27 @@ from utils.spark_util import SparkUtil ...@@ -7,29 +7,27 @@ from utils.spark_util import SparkUtil
from utils.es_util import EsUtils from utils.es_util import EsUtils
from pyspark.sql import functions as F from pyspark.sql import functions as F
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from datetime import datetime, timedelta
class EsAsinProfitRate(object):
    """Daily incremental sync of ASIN profit-rate data into Elasticsearch.

    Loads profit-rate rows updated since the previous day and writes them to a
    dedicated `*_profit_rate_extra_v2` index, then back-fills the nested
    profit fields into the monthly `st_detail` history indexes.
    """

    def __init__(self, site_name, date_info):
        """
        :param site_name: site identifier (e.g. "us"); prefixes index and table names.
        :param date_info: run date, "YYYY-MM-DD".
        """
        self.site_name = site_name
        self.date_info = date_info
        # Incremental window starts one day before the run date.
        self.last_date_info = (datetime.strptime(date_info, "%Y-%m-%d").date() - timedelta(days=1)).strftime("%Y-%m-%d")
        self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
        # ES-related configuration
        self.es_client = EsUtils.get_es_client()
        self.es_profit_rate_index = f"{self.site_name}_profit_rate_extra_v2"
        self.es_profit_rate_body = self.get_es_profit_rate_body()
        self.es_profit_rate_options = self.get_es_profit_rate_options(self.es_profit_rate_index)
        # Placeholder DataFrame; replaced by the real data in save_profit_rate_add().
        # (f-prefix removed: the query is a constant with no placeholders)
        self.df_asin_profit_rate = self.spark.sql("select 1+1;")
@staticmethod @staticmethod
def get_es_body(): def get_es_profit_rate_body():
return { return {
"settings": { "settings": {
"number_of_shards": "3", "number_of_shards": "3",
...@@ -51,13 +49,17 @@ class EsAsinProfitRate(object): ...@@ -51,13 +49,17 @@ class EsAsinProfitRate(object):
}, },
"air_profit": { "air_profit": {
"type": "float" "type": "float"
},
"update_time": {
"type": "date",
"format": "yyyy-MM-dd"
} }
} }
} }
} }
@staticmethod @staticmethod
def get_es_options(index_name): def get_es_profit_rate_options(index_name):
return { return {
"es.nodes": EsUtils.__es_ip__, "es.nodes": EsUtils.__es_ip__,
"es.port": EsUtils.__es_port__, "es.port": EsUtils.__es_port__,
...@@ -74,173 +76,121 @@ class EsAsinProfitRate(object): ...@@ -74,173 +76,121 @@ class EsAsinProfitRate(object):
} }
def run(self): def run(self):
self.read_data() self.save_profit_rate_add()
self.es_save() self.update_history_index()
self.create_enrich_policy()
self.create_enrich_pipeline()
def read_data(self): def save_profit_rate_add(self):
# 读取利润率整合数据(增量数据)
sql = f""" sql = f"""
select asin, price, ocean_profit, air_profit, package_length, package_width, package_height, weight select asin, price, ocean_profit, air_profit, updated_time from dim_asin_profit_rate_info
from dim_asin_profit_rate_info where site_name = '{self.site_name}' where site_name = '{self.site_name}' and updated_time >= '{self.last_date_info}'
""" """
self.df_asin_profit_rate = self.spark.sql(sqlQuery=sql).repartition(40, 'asin') self.df_asin_profit_rate = self.spark.sql(sqlQuery=sql).repartition(40, 'asin')
self.df_asin_profit_rate = self.df_asin_profit_rate.withColumn(
sql = f"""
select asin, package_length, package_width, package_height, weight
from dim_keepa_asin_info where site_name = '{self.site_name}'
"""
self.df_keepa_asin = self.spark.sql(sqlQuery=sql).repartition(40, 'asin')
# 因为 dim_asin_profit_rate_info 存在重复计算利润率的情况,保留与keepa最新数据所对应的数据行
self.df_asin_profit_rate = self.df_asin_profit_rate.join(
self.df_keepa_asin, on=['asin', 'package_length', 'package_width', 'package_height', 'weight'], how='inner'
).select(
'asin', 'price', 'ocean_profit', 'air_profit'
).withColumn(
'profit_key', F.concat_ws("_", F.col("asin"), F.col("price")) 'profit_key', F.concat_ws("_", F.col("asin"), F.col("price"))
).withColumn(
"update_time",
F.when(
F.col("updated_time").isNotNull(),
F.substring(F.col("updated_time"), 1, 10)
).otherwise(F.lit("1970-01-01"))
).select(
'profit_key', 'asin', 'price', 'ocean_profit', 'air_profit', 'update_time'
).cache() ).cache()
print(f"增量利润率数据如下:")
self.df_asin_profit_rate.show(10, False)
def es_save(self): print(f"创建利润率索引:{self.es_profit_rate_index}!")
print(f"创建富集索引:{self.es_index}!") EsUtils.create_index(self.es_profit_rate_index, self.es_client, self.es_profit_rate_body)
EsUtils.create_index(self.es_index, self.es_client, self.es_body)
try: try:
self.df_asin_profit_rate.write.format("org.elasticsearch.spark.sql") \ # self.df_asin_profit_rate.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \ # .options(**self.es_profit_rate_options) \
.mode("append") \ # .mode("append") \
.save() # .save()
print(f"ES {self.es_index} 索引更新完毕!") print(f"ES {self.es_profit_rate_index} 索引更新完毕!")
except Exception as e: except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e)) print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_index}') CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_profit_rate_index}')
def create_enrich_policy(self): def update_history_index(self):
# self.es_client.ingest.delete_pipeline(id=self.pipeline_id) """更新历史月度索引的利润率数据"""
# self.es_client.enrich.delete_policy(name=self.policy_name) # 从 2025-05 开始,遍历到最新索引
# print(f"创建富集策略:{self.policy_name}!") start_date = datetime(2025, 5, 1)
# policy_body = { current_date = start_date
# "match": {
# "indices": f"{self.es_index}", while True:
# "match_field": "profit_key", year = current_date.year
# "enrich_fields": ["ocean_profit", "air_profit"] month = current_date.month
# } month_str = f"{year}-{month:02d}"
# } index_name = f"{self.site_name}_st_detail_month_{year}_{month:02d}"
# self.es_client.enrich.put_policy(name=self.policy_name, body=policy_body)
# 检查索引是否存在
print(f"刷新富集策略:{self.profit_rate_policy}!") if not self.es_client.indices.exists(index=index_name):
self.es_client.enrich.execute_policy(self.profit_rate_policy, request_timeout=1800) print(f"索引 {index_name} 不存在,停止遍历")
break
def create_enrich_pipeline(self):
print(f"创建富集管道:{self.pipeline_id}!") print(f"\n{'='*60}")
pipeline_body = { print(f"开始处理索引: {index_name}")
"description": "asin profit_rate and user_mask pipeline", print(f"{'='*60}")
"processors": [
{ try:
"enrich": { self.update_single_history_index(index_name, month_str)
"policy_name": self.profit_rate_policy, except Exception as e:
"field": "profit_key", print(f"更新索引 {index_name} 失败: {str(e)}")
"target_field": "profit_rate_extra",
"max_matches": 1, # 移动到下个月
"ignore_missing": True if month == 12:
}, current_date = datetime(year + 1, 1, 1)
}, else:
{ current_date = datetime(year, month + 1, 1)
"enrich": {
"policy_name": f"{self.user_mask_asin_policy}", def update_single_history_index(self, index_name, month_str):
"field": "asin", """更新单个历史索引"""
"target_field": "policy_add_1", hive_sql = f"""
"max_matches": 1, select asin, asin_price as price from dwt_flow_asin where site_name = '{self.site_name}' and date_type = 'month'
"ignore_missing": True and date_info = '{month_str}' and asin_price is not null
}, """
}, df_hive = self.spark.sql(hive_sql)
{
"enrich": {
"policy_name": f"{self.user_mask_category_policy}",
"field": "category_id",
"target_field": "policy_add_2",
"max_matches": 1,
"ignore_missing": True
},
},
{
"set": {
"field": "usr_mask_type",
"value": "{{policy_add_1.usr_mask_type}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "usr_mask_progress",
"value": "{{policy_add_1.usr_mask_progress}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "package_quantity",
"value": "{{policy_add_1.package_quantity}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "usr_mask_type",
"value": "{{policy_add_2.usr_mask_type}}",
"ignore_empty_value": True
}
},
{
"remove": {
"field": "policy_add_1",
"ignore_missing": True
}
},
{
"remove": {
"field": "policy_add_2",
"ignore_missing": True
}
},
{
"convert": {
"field": "package_quantity",
"type": "integer",
"ignore_missing": True
}
}
]
}
self.es_client.ingest.put_pipeline(id=self.pipeline_id, body=pipeline_body) df_update = self.df_asin_profit_rate.join(
df_hive, on=['asin', 'price'], how='inner'
# 刷新ES数据,使pipeline生效 ).withColumn(
# body = { "profit_rate_extra",
# "query": { F.struct(
# "bool": { F.col("ocean_profit").alias("ocean_profit"),
# "must_not": { F.col("air_profit").alias("air_profit")
# "exists": { )
# "field": "profit_rate_extra" ).select("asin", "profit_rate_extra")
# }
# } es_options = {
# } "es.nodes": EsUtils.__es_ip__,
# } "es.port": EsUtils.__es_port__,
# } "es.net.http.auth.user": EsUtils.__es_user__,
body = { "es.net.http.auth.pass": EsUtils.__es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.size.entries": "5000",
"es.write.operation": "update",
"es.batch.write.retry.count": "3",
"es.batch.write.retry.wait": "10s",
"es.internal.es.version.ignore": "true" # 忽略版本检查
} }
self.es_client.update_by_query(
index="us_st_detail_month_2025_11", print(f"索引 {index_name} 待更新数据量: {df_update.count()}")
body=body, df_update.show(5, False)
pipeline=self.pipeline_id,
refresh=True, df_update.write.format("org.elasticsearch.spark.sql") \
wait_for_completion=False, .options(**es_options) \
request_timeout=600 .mode("append") \
) .save()
pass
print(f"索引 {index_name} 更新完毕!")
if __name__ == "__main__":
    # Robustness: fail with a clear usage message instead of an IndexError
    # when invoked without both arguments.
    if len(sys.argv) < 3:
        print("Usage: python <script> <site_name> <date_info YYYY-MM-DD>")
        sys.exit(1)
    site_name = sys.argv[1]
    date_info = sys.argv[2]
    handle_obj = EsAsinProfitRate(site_name, date_info)
    handle_obj.run()
    print("success!!!")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment