Commit e6ac1bbf by wangjing

no message

parent cb4b4b8a
......@@ -4,7 +4,6 @@ import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.db_util import DbTypes
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__':
......@@ -12,40 +11,53 @@ if __name__ == '__main__':
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
assert site_name is not None, "site_name 不能为空!"
assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!"
year = date_info.split("-")[0]
db_type = 'postgresql_15'
import_table = f"{site_name}_self_asin_detail_{year}"
hive_table = "ods_self_asin_detail"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
engine = get_remote_engine(
site_name=site_name,
db_type=db_type
)
db_type = DbTypes.postgresql.name
year = CommonUtil.reformat_date(date_info, "%Y-%m-%d", "%Y", )
query = f"""
select
SELECT
asin,
img_url,
title,
REPLACE(REPLACE(REPLACE(img_url, E'\n',' '), E'\r',' '), E'\t',' ') AS img_url,
REPLACE(REPLACE(REPLACE(title, E'\n',' '), E'\r',' '), E'\t',' ') AS title,
title_len,
price,
rating,
total_comments,
buy_box_seller_type,
page_inventory,
category,
volume,
REPLACE(REPLACE(REPLACE(category, E'\n',' '), E'\r',' '), E'\t',' ') AS category,
REPLACE(REPLACE(REPLACE(volume, E'\n',' '), E'\r',' '), E'\t',' ') AS volume,
weight,
rank,
launch_time,
video_url,
add_url,
material,
created_at,
REPLACE(REPLACE(REPLACE(video_url, E'\n',' '), E'\r',' '), E'\t',' ') AS video_url,
REPLACE(REPLACE(REPLACE(add_url, E'\n',' '), E'\r',' '), E'\t',' ') AS add_url,
REPLACE(REPLACE(REPLACE(material, E'\n',' '), E'\r',' '), E'\t',' ') AS material,
img_num,
img_type,
REPLACE(REPLACE(REPLACE(img_type, E'\n',' '), E'\r',' '), E'\t',' ') AS img_type,
qa_num,
brand,
ac_name,
node_id,
sp_num,
mpn,
REPLACE(REPLACE(REPLACE(brand, E'\n',' '), E'\r',' '), E'\t',' ') AS brand,
REPLACE(REPLACE(REPLACE(ac_name, E'\n',' '), E'\r',' '), E'\t',' ') AS ac_name,
REPLACE(REPLACE(REPLACE(node_id, E'\n',' '), E'\r',' '), E'\t',' ') AS node_id,
REPLACE(REPLACE(REPLACE(sp_num, E'\n',' '), E'\r',' '), E'\t',' ') AS sp_num,
REPLACE(REPLACE(REPLACE(mpn, E'\n',' '), E'\r',' '), E'\t',' ') AS mpn,
online_time,
describe,
REPLACE(REPLACE(REPLACE(describe, E'\n',' '), E'\r',' '), E'\t',' ') AS describe,
one_star,
two_star,
three_star,
......@@ -53,38 +65,64 @@ if __name__ == '__main__':
five_star,
low_star,
asin_type,
is_coupon,
search_category,
weight_str,
account_name,
other_seller_name,
account_id
from {site_name}_self_asin_detail_{year}
where 1 = 1
and site = '{site_name}'
and bsr_date_info = '{date_info}'
and date_info >= '{date_info}'
and \$CONDITIONS
"""
REPLACE(REPLACE(REPLACE(is_coupon, E'\n',' '), E'\r',' '), E'\t',' ') AS is_coupon,
REPLACE(REPLACE(REPLACE(search_category, E'\n',' '), E'\r',' '), E'\t',' ') AS search_category,
REPLACE(REPLACE(REPLACE(weight_str, E'\n',' '), E'\r',' '), E'\t',' ') AS weight_str,
REPLACE(REPLACE(REPLACE(account_name, E'\n',' '), E'\r',' '), E'\t',' ') AS account_name,
REPLACE(REPLACE(REPLACE(other_seller_name, E'\n',' '), E'\r',' '), E'\t',' ') AS other_seller_name,
REPLACE(REPLACE(REPLACE(bsr_date_info, E'\n',' '), E'\r',' '), E'\t',' ') AS bsr_date_info,
REPLACE(REPLACE(REPLACE(account_id, E'\n',' '), E'\r',' '), E'\t',' ') AS account_id,
REPLACE(REPLACE(REPLACE(package_quantity, E'\n',' '), E'\r',' '), E'\t',' ') AS package_quantity,
REPLACE(REPLACE(REPLACE(pattern_name, E'\n',' '), E'\r',' '), E'\t',' ') AS pattern_name,
REPLACE(REPLACE(REPLACE(together_asin, E'\n',' '), E'\r',' '), E'\t',' ') AS together_asin,
REPLACE(REPLACE(REPLACE(activity_type, E'\n',' '), E'\r',' '), E'\t',' ') AS activity_type,
one_two_val,
three_four_val,
five_six_val,
eight_val,
REPLACE(REPLACE(REPLACE(product_description, E'\n',' '), E'\r',' '), E'\t',' ') AS product_description,
follow_sellers,
REPLACE(REPLACE(REPLACE(buy_sales, E'\n',' '), E'\r',' '), E'\t',' ') AS buy_sales,
image_view,
REPLACE(REPLACE(REPLACE(product_json, E'\n',' '), E'\r',' '), E'\t',' ') AS product_json,
REPLACE(REPLACE(REPLACE(productdetail_json, E'\n',' '), E'\r',' '), E'\t',' ') AS productdetail_json,
REPLACE(REPLACE(REPLACE(review_ai_text, E'\n',' '), E'\r',' '), E'\t',' ') AS review_ai_text,
REPLACE(REPLACE(REPLACE(review_label_json, E'\n',' '), E'\r',' '), E'\t',' ') AS review_label_json,
REPLACE(REPLACE(REPLACE(lob_asin_json, E'\n',' '), E'\r',' '), E'\t',' ') AS lob_asin_json,
REPLACE(REPLACE(REPLACE(sp_initial_seen_asins_json, E'\n',' '), E'\r',' '), E'\t',' ') AS sp_initial_seen_asins_json,
REPLACE(REPLACE(REPLACE(sp_4stars_initial_seen_asins_json, E'\n',' '), E'\r',' '), E'\t',' ') AS sp_4stars_initial_seen_asins_json,
REPLACE(REPLACE(REPLACE(sp_delivery_initial_seen_asins_json, E'\n',' '), E'\r',' '), E'\t',' ') AS sp_delivery_initial_seen_asins_json,
REPLACE(REPLACE(REPLACE(compare_similar_asin_json, E'\n',' '), E'\r',' '), E'\t',' ') AS compare_similar_asin_json,
REPLACE(REPLACE(REPLACE(customer_reviews_json, E'\n',' '), E'\r',' '), E'\t',' ') AS customer_reviews_json,
REPLACE(REPLACE(REPLACE(together_asin_json, E'\n',' '), E'\r',' '), E'\t',' ') AS together_asin_json,
REPLACE(REPLACE(REPLACE(min_match_asin_json, E'\n',' '), E'\r',' '), E'\t',' ') AS min_match_asin_json,
REPLACE(REPLACE(REPLACE(seller_json, E'\n',' '), E'\r',' '), E'\t',' ') AS seller_json,
returns,
created_at, updated_at,
REPLACE(REPLACE(REPLACE(result_list_json, E'\n',' '), E'\r',' '), E'\t',' ') AS result_list_json,
REPLACE(REPLACE(REPLACE(variat_list, E'\n',' '), E'\r',' '), E'\t',' ') AS variat_list,
REPLACE(REPLACE(REPLACE(bundle_asin_component_json, E'\n',' '), E'\r',' '), E'\t',' ') AS bundle_asin_component_json,
REPLACE(REPLACE(REPLACE(cart_type, E'\n',' '), E'\r',' '), E'\t',' ') AS cart_type,
REPLACE(REPLACE(REPLACE(site, E'\n',' '), E'\r',' '), E'\t',' ') AS site
hive_tb = "ods_self_asin_detail"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info,
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
FROM {import_table}
WHERE date_info='{date_info}' AND \$CONDITIONS
"""
engine = get_remote_engine(
site_name=site_name,
db_type=db_type
)
# spider_int,
# variat_num,
# REPLACE(REPLACE(REPLACE(img_list, E'\n',' '), E'\r',' '), E'\t',' ') AS img_list,
# REPLACE(REPLACE(REPLACE(parent_asin, E'\n',' '), E'\r',' '), E'\t',' ') AS parent_asin,
# REPLACE(REPLACE(REPLACE(video_m3u8_url, E'\n',' '), E'\r',' '), E'\t',' ') AS video_m3u8_url,
# REPLACE(REPLACE(REPLACE(review_json_list, E'\n',' '), E'\r',' '), E'\t',' ') AS review_json_list,
# fbm_delivery_price
engine.sqoop_raw_import(
# query=f"SELECT {cols} FROM {import_table} WHERE 1=1 and \$CONDITIONS",
query=query,
hive_table=hive_tb,
hdfs_path=hdfs_path,
partitions=partition_dict
hive_table=hive_table,
# hdfs_path=hdfs_path,
partitions=partition_dict,
# m=50,
# split_by='id'
)
pass
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment