1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import sys
import time
import traceback
import pandas as pd
# Set at import time, before the project import below runs.
# NOTE(review): presumably silences a pyarrow timezone warning raised by a
# dependency; nothing in this file uses pyarrow directly -- confirm it is needed.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# sys.path[0] is the directory of the running script; adding its parent lets
# the sibling `utils` package be imported when this file is run as a script.
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.templates_mysql import TemplatesMysql
class PicturesLocal(object):
    """Record the local paths of downloaded product images in a database table.

    Rows are written to ``{site_name}_pictures_local_path{self_flag}`` through
    the engine exposed by ``TemplatesMysql``. Each row holds the image
    identifier (``asin``: the file name with ``.jpg`` removed) and the file's
    ``local_path``; the ``self_flag`` mode additionally stores ``platform``.

    Args:
        site_name: Prefix of the destination table name.
        dir_path: Root directory that contains the image sub-directories.
        self_flag: Suffix of the destination table name. When non-empty,
            ``run()`` uses the incremental mode (only new asins are appended).
        platform: Value written to the ``platform`` column in incremental mode.
    """

    def __init__(self, site_name='us', dir_path='/mnt/data/img_data/us/', self_flag='', platform='temu'):
        self.site_name = site_name
        self.dir_path = dir_path
        self.self_flag = self_flag
        self.platform = platform
        # Top-level entries of dir_path; iterated by run() when self_flag is empty.
        self.dir_path_list = os.listdir(dir_path)
        # Start as empty lists (they previously held the dir_path string, which
        # did not match their list-style names); run() fills asin_list.
        self.file_path_list = []
        self.asin_list = []
        self.engine = TemplatesMysql().engine

    @property
    def _table_name(self):
        """Name of the destination table for this site / flag combination."""
        return f"{self.site_name}_pictures_local_path{self.self_flag}"

    @staticmethod
    def _asin_from_filename(filename):
        """Derive the asin from an image file name by removing ``.jpg``."""
        # Kept as a plain replace (not a suffix strip) so the derived keys stay
        # identical to the ones already stored in the table.
        return filename.replace(".jpg", "")

    def list_files(self, dir_path):
        """Walk ``dir_path`` recursively and list every file found.

        Args:
            dir_path: Directory to scan.

        Returns:
            A DataFrame with columns ``asin`` and ``local_path``, one row per
            file. It is empty (but keeps both columns) when nothing is found.
        """
        rows = []
        for dirpath, _dirnames, filenames in os.walk(dir_path):
            for filename in filenames:
                # os.walk already yields the bare file name, so there is no
                # need to split the joined path on a hard-coded "/" separator.
                rows.append([self._asin_from_filename(filename), os.path.join(dirpath, filename)])
        df = pd.DataFrame(rows, columns=['asin', 'local_path'])
        print(df.shape)
        return df

    def save_data(self, df):
        """Append ``df`` to the destination table (created by pandas if missing)."""
        df.to_sql(self._table_name, con=self.engine, if_exists="append", index=False)

    def run(self):
        """Index the images under ``self.dir_path`` and store them.

        With a non-empty ``self_flag`` the incremental mode runs: only asins
        not yet present in the table are appended, then the method returns.
        Otherwise every entry of ``self.dir_path_list`` is scanned recursively
        and all files found are appended; a failing scan/save is retried every
        10 seconds with a fresh engine until it succeeds.
        """
        if self.self_flag:
            self._sync_new_images()
            # Return instead of calling quit(): terminating the interpreter
            # from inside a method would also kill any caller that imports
            # and reuses this class.
            return

        for dir_name in self.dir_path_list:
            # os.path.join keeps the path valid whether or not dir_path ends
            # with a slash (plain string concatenation did not).
            dir_path = os.path.join(self.dir_path, dir_name)
            self._index_directory_with_retry(dir_path)

    def _sync_new_images(self):
        """Append images whose asin is not yet stored in the table."""
        sql = f"select * from {self._table_name};"
        df_read = pd.read_sql(sql, con=self.engine)
        print("df_read.shape:", df_read.shape)

        # Keep only sub-directories: loose .jpg files are skipped as before,
        # and any other non-directory entry is skipped too, because calling
        # os.listdir on it would raise NotADirectoryError.
        site_list = [
            site for site in os.listdir(self.dir_path)
            if ".jpg" not in site and os.path.isdir(os.path.join(self.dir_path, site))
        ]
        print(site_list)

        self.asin_list = []
        self.dir_path_list = []
        for site in site_list:
            site_dir = os.path.join(self.dir_path, site)
            print(f"{self.dir_path}/{site}")
            dir_list = os.listdir(site_dir)
            asin_list = [self._asin_from_filename(name) for name in dir_list]
            print("dir_list:", dir_list[:10])
            print("asin_list:", asin_list[:10])
            self.asin_list.extend(asin_list)
            self.dir_path_list.extend(os.path.join(site_dir, name) for name in dir_list)

        df = pd.DataFrame({'asin': self.asin_list, 'local_path': self.dir_path_list})
        print(df.shape)
        print(df.head())
        df['platform'] = self.platform
        print("df.shape:", df.shape)
        # Drop every asin that is already present in the table.
        df = df.loc[~df.asin.isin(df_read.asin)]
        print("df.shape:", df.shape)
        self.save_data(df=df)

    def _index_directory_with_retry(self, dir_path):
        """Scan ``dir_path`` and save the result, retrying until it succeeds."""
        while True:
            try:
                print("当前存储的图片路径:", dir_path)
                df = self.list_files(dir_path=dir_path)
                self.save_data(df=df)
                return
            except Exception as e:
                # Deliberate best-effort: log, rebuild the engine (the
                # connection may have dropped), wait, then try again.
                print(e, traceback.format_exc())
                self.engine = TemplatesMysql().engine
                time.sleep(10)
if __name__ == '__main__':
    # Usage: python <this_script> <platform>
    # The platform name selects the image root /mnt/data/img_data/<platform>
    # and is also the value written to the `platform` column; self_flag="_self"
    # selects the incremental mode and the "..._pictures_local_path_self" table.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} <platform>")
    platform = sys.argv[1]  # argument 1: platform
    handle_obj = PicturesLocal(dir_path=f'/mnt/data/img_data/{platform}', self_flag="_self", platform=platform)
    handle_obj.run()