Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
5bf88f84
Commit
5bf88f84
authored
Mar 20, 2026
by
吴济苍
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/developer' into developer
parents
67c72130
5ead1c9f
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
58 additions
and
51 deletions
+58
-51
dwt_flow_asin.py
Pyspark_job/dwt/dwt_flow_asin.py
+20
-3
dwt_nsr_asin_detail.py
Pyspark_job/dwt/dwt_nsr_asin_detail.py
+1
-1
es_ai_asin_add.py
Pyspark_job/export_es/es_ai_asin_add.py
+25
-0
es_asin_profit_rate.py
Pyspark_job/export_es/es_asin_profit_rate.py
+0
-0
import_st_to_pg14.py
Pyspark_job/listen_program/import_st_to_pg14.py
+2
-1
export_asin_without_keepa.py
Pyspark_job/script/export_asin_without_keepa.py
+10
-4
export_dwt_ai_asin_add.py
Pyspark_job/sqoop_export/export_dwt_ai_asin_add.py
+0
-42
No files found.
Pyspark_job/dwt/dwt_flow_asin.py
View file @
5bf88f84
...
...
@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates):
CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1
WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1
ELSE 0 END"""
))
.
drop
(
"hide_flag"
)
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"asin_is_need"
,
F
.
expr
(
"""
CASE WHEN category_first_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1
WHEN asin NOT LIKE 'B0
%
' THEN 1
# 解析 asin_category_desc 取 › 分隔的第一个元素作为补充分类名称
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"desc_category_first_name"
,
F
.
lower
(
F
.
trim
(
F
.
split
(
F
.
col
(
"asin_category_desc"
),
"›"
)
.
getItem
(
0
)))
)
# 读取 Hive 分类维表,获取分类名称与ID的对应关系
sql_dim
=
f
"""
select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
"""
df_bsr_category
=
F
.
broadcast
(
self
.
spark
.
sql
(
sqlQuery
=
sql_dim
))
# join 补充分类ID
self
.
df_asin_detail
=
self
.
df_asin_detail
.
join
(
df_bsr_category
,
on
=
[
'desc_category_first_name'
],
how
=
'left'
)
# 两个分类ID均在过滤列表中才标记为1
need_categories
=
"('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"asin_is_need"
,
F
.
expr
(
f
"""
CASE WHEN category_first_id in {need_categories}
AND desc_category_first_id in {need_categories} THEN 1
WHEN asin NOT LIKE 'B0
%
' THEN 1
ELSE 0 END"""
))
self
.
df_asin_detail
=
self
.
df_asin_detail
.
drop
(
"desc_category_first_name"
,
"desc_category_first_id"
)
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"asin_type"
,
F
.
expr
(
"""
CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END"""
))
.
drop
(
"asin_is_self"
,
"asin_is_need"
,
"asin_is_hide"
)
...
...
Pyspark_job/dwt/dwt_nsr_asin_detail.py
View file @
5bf88f84
...
...
@@ -154,7 +154,7 @@ class DwtNsrAsinDetail(object):
select asin,
asin_ao_val as ao_val,
bsr_orders as bsr_orders,
asin_bsr_orders_
change
as bsr_orders_change,
asin_bsr_orders_
mom
as bsr_orders_change,
asin_air_freight_gross_margin as asin_air_freight_gross_margin,
asin_ocean_freight_gross_margin as asin_ocean_freight_gross_margin,
cast(asin_bought_month as int ) as asin_bought_month
...
...
Pyspark_job/export_es/es_ai_asin_add.py
View file @
5bf88f84
...
...
@@ -200,7 +200,32 @@ class EsAiAsinAdd(object):
'total_comments'
,
'uses'
,
'variation_flag'
,
'variation_num'
,
'weight'
)
def create_pg_table(self):
    """
    Create the monthly PostgreSQL table from a template table.

    1. LIKE ... INCLUDING ALL copies every column type, the other columns'
       defaults, constraints and indexes from the template.
    2. The id column is given its own sequence so that it does not share
       the template table's sequence.

    Nothing is created when the target table already exists.
    """
    template_tb = "us_ai_asin_detail_month_2026_01"
    target_tb = self.export_pg_tb
    pg_engine = DBUtil.get_db_engine("postgresql", "us")

    # to_regclass yields NULL (None) when no relation with that name exists.
    lookup = DBUtil.engine_exec_sql(pg_engine, f"SELECT to_regclass('{target_tb}')")
    existing = list(lookup)[0][0]

    if existing is None:
        # Create the table, then attach a dedicated sequence to the id column.
        ddl = f"""
CREATE TABLE {target_tb} (LIKE {template_tb} INCLUDING ALL);
ALTER TABLE {target_tb} ALTER COLUMN id DROP DEFAULT;
CREATE SEQUENCE {target_tb}_id_seq OWNED BY {target_tb}.id;
ALTER TABLE {target_tb} ALTER COLUMN id SET DEFAULT nextval('{target_tb}_id_seq')
"""
        DBUtil.exec_sql("postgresql", "us", ddl)
        print(f"PostgreSQL 表 {target_tb} 创建完成(独立自增序列)")
    else:
        # Table already present: skip creation.
        print(f"PostgreSQL 表 {target_tb} 已存在,跳过建表")
def
save_data
(
self
):
# 创建月度 PG 表
self
.
create_pg_table
()
# 将新增asin导出给济苍
try
:
self
.
df_save_pg
.
write
.
format
(
"jdbc"
)
\
...
...
Pyspark_job/export_es/es_asin_profit_rate.py
View file @
5bf88f84
This diff is collapsed.
Click to expand it.
Pyspark_job/listen_program/import_st_to_pg14.py
View file @
5bf88f84
...
...
@@ -21,7 +21,7 @@ class ImportStToPg14(object):
self
.
df_st_month
=
pd
.
DataFrame
()
self
.
df_st_month_state
=
pd
.
DataFrame
()
self
.
df_save
=
pd
.
DataFrame
()
self
.
fetch_year_month_by_week
()
# 如果传的date_type='week', 将date_info转换成月的值
#
self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值
self
.
year
,
self
.
month
=
self
.
date_info
.
split
(
"-"
)[
0
],
int
(
self
.
date_info
.
split
(
"-"
)[
1
])
def
fetch_year_month_by_week
(
self
):
...
...
@@ -31,6 +31,7 @@ class ImportStToPg14(object):
self
.
date_info
=
list
(
df
.
year_month
)[
0
]
def
read_data
(
self
):
self
.
fetch_year_month_by_week
()
# 如果传的date_type='week', 将date_info转换成月的值
# 1. 读取date_20_to_30表获取月份对应的周
sql_get_week
=
f
"select year_week, year, week from selection.date_20_to_30 WHERE `year_month`='{self.date_info}' and week_day=1"
df_week
=
pd
.
read_sql
(
sql_get_week
,
con
=
self
.
engine_mysql
)
...
...
Pyspark_job/script/export_asin_without_keepa.py
View file @
5bf88f84
...
...
@@ -221,11 +221,17 @@ class ExportAsinWithoutKeepa(object):
df
=
df
.
cache
()
print
(
f
"筛选后数据量: {df.count()}"
)
# 排除 dim_keepa_asin_info 中已有 package_length 的ASIN
# 排除 dim_keepa_asin_info 中已有有效keepa数据的ASIN
# 若 package_length/width/height/weight 任意一个 < 0,视为数据异常,不排除(需重新抓取)
print
(
"7. 排除已有keepa数据的ASIN (dim_keepa_asin_info)"
)
df_keepa
=
self
.
spark
.
sql
(
"select asin from dim_keepa_asin_info where package_length is not null"
)
.
repartition
(
40
,
'asin'
)
df_keepa
=
self
.
spark
.
sql
(
f
"""
select asin from dim_keepa_asin_info
where site_name = '{self.site_name}'
and package_length >= 0
and package_width >= 0
and package_height >= 0
and weight >= 0
"""
)
.
repartition
(
40
,
'asin'
)
df
=
df
.
join
(
df_keepa
,
on
=
'asin'
,
how
=
'left_anti'
)
.
cache
()
print
(
f
"排除keepa后数据量: {df.count()}"
)
...
...
Pyspark_job/sqoop_export/export_dwt_ai_asin_add.py
deleted
100644 → 0
View file @
67c72130
import
os
import
sys
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
from
utils.common_util
import
CommonUtil
from
utils.secure_db_client
import
get_remote_engine
if __name__ == '__main__':
    # Positional command-line arguments: site, date granularity, date value.
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    print(f"执行参数为{sys.argv}")

    # Database engine; requested for the 'us' site regardless of site_name.
    engine = get_remote_engine(site_name='us', db_type="postgresql_15")

    # The 'us' site uses an unprefixed table name; other sites are prefixed
    # with their site name.
    month_suffix = date_info.replace('-', '_')
    table_prefix = "" if site_name == 'us' else f"{site_name}_"
    export_tb = f"{table_prefix}ai_asin_detail_month_{month_suffix}"

    # Columns exported from the Hive table, in order.
    export_columns = (
        "site_name", "asin", "weight", "bought_month", "category", "img",
        "title", "brand", "account_name", "account_addr",
        "buy_box_seller_type", "launch_time", "img_num", "variation_flag",
        "variation_num", "ao_val", "category_id", "category_current_id",
        "parent_asin", "bsr_rank", "price", "rating", "total_comments",
        "seller_id", "fb_country_name", "review_json_list",
        "launch_time_type", "describe", "product_json",
        "product_detail_json", "bought_month_mom", "bought_month_yoy",
        "is_new_flag", "is_ascending_flag",
    )
    hive_partitions = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info,
    }

    # Export the data.
    engine.sqoop_raw_export(
        hive_table="dwt_ai_asin_add",
        import_table=export_tb,
        partitions=hive_partitions,
        m=30,
        cols=",".join(export_columns),
    )
    print("success")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment