Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
6dd7a66e
Commit
6dd7a66e
authored
Jun 04, 2026
by
fangxingjun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复以图搜图导入doris数据不一致问题
parent
79a384c1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
49 additions
and
41 deletions
+49
-41
img_id_index_to_doris.py
Pyspark_job/img_search/img_id_index_to_doris.py
+49
-41
No files found.
Pyspark_job/img_search/img_id_index_to_doris.py
View file @
6dd7a66e
import
ast
import
datetime
import
logging
import
os
import
os
import
re
import
sys
import
sys
import
threading
sys
.
path
.
append
(
"/opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/"
)
import
time
import
traceback
import
pandas
as
pd
import
redis
from
pyspark.sql.types
import
ArrayType
,
FloatType
os
.
environ
[
"PYARROW_IGNORE_TIMEZONE"
]
=
"1"
os
.
environ
[
"PYARROW_IGNORE_TIMEZONE"
]
=
"1"
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
# 上级目录
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
# 上级目录
from
utils.templates_mysql
import
TemplatesMysql
from
utils.templates
import
Templates
from
utils.templates
import
Templates
# from ..utils.templates import Templates
from
utils.secure_db_client
import
get_remote_engine
from
py4j.java_gateway
import
java_import
from
sqlalchemy
import
text
from
pyspark.sql
import
functions
as
F
import
pyarrow
as
pa
import
pyarrow.parquet
as
pq
from
multiprocessing
import
Process
from
multiprocessing
import
Pool
import
multiprocessing
from
utils.db_util
import
DbTypes
,
DBUtil
from
utils.db_util
import
DbTypes
,
DBUtil
from
utils.
StarRocksHelper
import
StarRock
sHelper
from
utils.
DorisHelper
import
Dori
sHelper
class
ImgIdIndexToDoris
(
Templates
):
class
ImgIdIndexToDoris
(
Templates
):
...
@@ -36,10 +16,12 @@ class ImgIdIndexToDoris(Templates):
...
@@ -36,10 +16,12 @@ class ImgIdIndexToDoris(Templates):
self
.
site_name
=
site_name
self
.
site_name
=
site_name
self
.
img_type
=
img_type
self
.
img_type
=
img_type
self
.
engine_doris
=
DBUtil
.
get_db_engine
(
db_type
=
DbTypes
.
doris
.
name
,
site_name
=
self
.
site_name
)
self
.
engine_doris
=
DBUtil
.
get_db_engine
(
db_type
=
DbTypes
.
doris
.
name
,
site_name
=
self
.
site_name
)
self
.
table_name
=
"img_dwd_id_index"
self
.
doris_table
=
"img_id_index_copy"
self
.
db_save
=
self
.
table_name
self
.
spark
=
self
.
create_spark_object
(
app_name
=
f
"{self.db_save}: {self.site_name}"
)
self
.
spark
=
self
.
create_spark_object
(
app_name
=
f
"{self.db_save}: {self.site_name}"
)
self
.
df_id_index
=
self
.
spark
.
sql
(
f
"select 1+1;"
)
self
.
df_id_index
=
self
.
spark
.
sql
(
f
"select 1+1;"
)
self
.
table_name
=
"img_dwd_id_index"
self
.
doris_db
=
"selection"
self
.
table_save
=
"img_id_index_copy"
def
read_data
(
self
):
def
read_data
(
self
):
sql
=
f
"select id, index, img_unique, site_name, img_type from {self.table_name} where site_name='{self.site_name}' and img_type = '{self.img_type}';"
sql
=
f
"select id, index, img_unique, site_name, img_type from {self.table_name} where site_name='{self.site_name}' and img_type = '{self.img_type}';"
...
@@ -48,24 +30,50 @@ class ImgIdIndexToDoris(Templates):
...
@@ -48,24 +30,50 @@ class ImgIdIndexToDoris(Templates):
self
.
df_id_index
.
show
(
10
)
self
.
df_id_index
.
show
(
10
)
print
(
f
"self.df_id_index.count(): {self.df_id_index.count()}"
)
print
(
f
"self.df_id_index.count(): {self.df_id_index.count()}"
)
def
handle_data
(
self
):
# def save_data(self):
pass
# # starrocks_url = "jdbc:mysql://192.168.10.151:19030/selection"
# # properties = {
# # "user": "fangxingjun",
# # "password": "fangxingjun12345",
# # "driver": "com.mysql.cj.jdbc.Driver",
# # # "driver": "com.mysql.cj.jdbc.Driver",
# # }
# # self.df_id_index.write.jdbc(url=starrocks_url, table="image_id_index", mode="overwrite", properties=properties)
# # self.df_id_index = self.df_id_index.withColumn('created_time', F.lit(datetime.datetime.now()))
# # self.df_id_index = self.df_id_index.withColumn("img_type", F.col("img_type").cast("int"))
# # StarRocksHelper.spark_export(df_save=self.df_id_index, db_name='selection', table_name='image_id_index')
# df_save = self.df_id_index.toPandas()
#
# df_save.to_sql(self.table_save, con=self.engine_doris, if_exists="append", index=False, chunksize=10000)
def
truncate_data
(
self
):
engine
=
get_remote_engine
(
site_name
=
'us'
,
# -> database "selection"
db_type
=
"doris"
,
# -> 服务端 alias "mysql"
)
sql_truncate
=
f
"truncate table {self.doris_table};"
print
(
f
"清空最新导入之前的数据, sql_truncate: {sql_truncate}"
)
engine
.
execute
(
sql_truncate
)
def
save_data
(
self
):
def
run
(
self
):
# starrocks_url = "jdbc:mysql://192.168.10.151:19030/selection"
self
.
read_data
()
# properties = {
self
.
truncate_data
()
# "user": "fangxingjun",
df
=
self
.
df_id_index
# "password": "fangxingjun12345",
count
=
df
.
count
()
# "driver": "com.mysql.cj.jdbc.Driver",
print
(
f
"读取完成,数据量:{count}"
)
# # "driver": "com.mysql.cj.jdbc.Driver",
df
.
show
(
10
,
truncate
=
False
)
# }
# self.df_id_index.write.jdbc(url=starrocks_url, table="image_id_index", mode="overwrite", properties=properties)
# self.df_id_index = self.df_id_index.withColumn('created_time', F.lit(datetime.datetime.now()))
# self.df_id_index = self.df_id_index.withColumn("img_type", F.col("img_type").cast("int"))
# StarRocksHelper.spark_export(df_save=self.df_id_index, db_name='selection', table_name='image_id_index')
df_save
=
self
.
df_id_index
.
toPandas
()
df_save
.
to_sql
(
self
.
table_save
,
con
=
self
.
engine_doris
,
if_exists
=
"append"
,
index
=
False
,
chunksize
=
10000
)
TABLE_COLUMNS
=
"img_unique,site_name,index,id,img_type"
# ===== Step 2:写入 Doris selection.sys_edit_log =====
print
(
f
"[2/2] 写入 Doris {self.doris_db}.{self.doris_table}"
)
DorisHelper
.
spark_export_with_columns
(
df_save
=
df
,
db_name
=
self
.
doris_db
,
table_name
=
self
.
doris_table
,
table_columns
=
TABLE_COLUMNS
,
use_type
=
'selection'
,
)
print
(
"success"
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment