Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
27e0f356
Commit
27e0f356
authored
Apr 27, 2026
by
fangxingjun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
9044a5f3
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
48 additions
and
0 deletions
+48
-0
img_hdfs_index.py
Pyspark_job/img_search/img_hdfs_index.py
+4
-0
check_st_zr.py
Pyspark_job/listen_program/check_st_zr.py
+44
-0
No files found.
Pyspark_job/img_search/img_hdfs_index.py
View file @
27e0f356
...
@@ -66,6 +66,10 @@ class ImgHdfsIndex(Templates):
...
@@ -66,6 +66,10 @@ class ImgHdfsIndex(Templates):
pass
pass
def save_data(self):
    """Replace the contents of the ``self.db_save`` table with ``self.df_save``.

    The target table is first emptied with a ``truncate`` statement executed
    inside a transaction opened by ``self.engine_doris.begin()``; the frame
    is then appended with ``to_sql(..., if_exists="append", index=False)``.

    NOTE(review): the truncate and the append run as two separate
    operations (the append goes through the engine, not the truncate's
    connection), so a failing append leaves the table empty.
    """
    with self.engine_doris.begin() as conn:
        sql_truncate = f"truncate {self.db_save}"
        print(f"sql_truncate: {sql_truncate}")
        # Fixed: the method is `execute`; the previous `execut` raised
        # AttributeError, so the table was never truncated.
        # NOTE(review): if engine_doris is a SQLAlchemy 2.x engine, a raw
        # string must be wrapped in sqlalchemy.text() -- confirm the version.
        conn.execute(sql_truncate)
    self.df_save.to_sql(
        self.db_save,
        con=self.engine_doris,
        if_exists="append",
        index=False,
    )
def
run
(
self
):
def
run
(
self
):
...
...
Pyspark_job/listen_program/check_st_zr.py
0 → 100644
View file @
27e0f356
import
os
import
sys
import
time
import
traceback
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
from
utils.common_util
import
CommonUtil
from
utils.secure_db_client
import
get_remote_engine
if __name__ == '__main__':
    # Seconds to wait between polls, and before retrying after an error.
    WAIT_SECONDS = 300

    # Positional command-line arguments; each falls back to None when absent
    # (presumably read from sys.argv by CommonUtil.get_sys_arg -- confirm).
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)

    # Fail fast on missing arguments. Without this check a missing date_type
    # crashed with an AttributeError on None, a missing date_info was rendered
    # as the literal 'None' in the query (matching no rows and wrongly
    # reporting the crawl as finished), and a missing site_name produced a
    # non-existent table name that was retried forever.
    missing = [
        arg_name
        for arg_name, arg_value in (
            ("site_name", site_name),
            ("date_type", date_type),
            ("date_info", date_info),
        )
        if arg_value is None
    ]
    if missing:
        sys.exit(
            f"check_st_zr: missing required argument(s): {', '.join(missing)}; "
            "usage: check_st_zr.py <site_name> <date_type> <date_info>"
        )

    db_type = 'postgresql_14'
    # Table name is derived from the site and the date type with any
    # '_week' fragment removed.
    check_table = f"{site_name}_search_term_{date_type.replace('_week', '')}_syn"

    def check_syn():
        """Block until no row for ``date_info`` has state 1 or 2.

        Polls ``check_table`` every ``WAIT_SECONDS`` seconds. A fresh engine
        is requested on every iteration, and any exception is logged and
        retried after the same delay, so a transient database failure does
        not abort the wait.
        """
        # NOTE(review): table name and date are interpolated from CLI
        # arguments; keep this script restricted to trusted callers.
        sql_check_syn = (
            f"select * from {check_table} where date_info='{date_info}' and state in (1, 2) limit 100"
        )
        while True:
            try:
                engine = get_remote_engine(site_name=site_name, db_type=db_type)
                df = engine.read_sql(sql_check_syn)
                if df.shape[0] == 0:
                    print("搜索词-爬虫已经全部抓取完成, 可以同步数据")
                    break
                print("搜索词-爬虫还未抓完, 等待5分钟继续")
            except Exception as e:
                # Deliberately broad: this is a best-effort wait loop that
                # must keep polling through transient failures.
                print(f"搜索词-检查asin是否全部抓取完成报错, 报错信息: {e}, {traceback.format_exc()}")
            time.sleep(WAIT_SECONDS)

    check_syn()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment