"""Index locally stored product images into a MySQL table.

Run as a script with the platform name as the only argument; images are
looked up under ``/mnt/data/img_data/<platform>``.
"""
import os
import sys
import time
import traceback

import pandas as pd

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates_mysql import TemplatesMysql


class PicturesLocal(object):
    """Record (asin, local_path) pairs for image files found on disk.

    Rows are appended to the table
    ``{site_name}_pictures_local_path{self_flag}`` through the engine
    provided by ``TemplatesMysql`` (presumably a SQLAlchemy engine -- it is
    passed as ``con`` to pandas; confirm against ``utils.templates_mysql``).
    """

    def __init__(self, site_name='us', dir_path='/mnt/data/img_data/us/',
                 self_flag='', platform='temu'):
        """Store the configuration and list ``dir_path`` once.

        Args:
            site_name: Prefix of the target table name.
            dir_path: Root directory that holds the images.
            self_flag: Suffix of the target table name. A non-empty value
                also switches ``run`` to the incremental mode.
            platform: Value written to the ``platform`` column in the
                incremental mode.

        Raises:
            OSError: If ``dir_path`` cannot be listed.
        """
        self.site_name = site_name
        self.dir_path = dir_path
        self.self_flag = self_flag
        self.platform = platform
        # Entry names directly under dir_path; iterated by the full-walk mode.
        self.dir_path_list = os.listdir(dir_path)
        # Both attributes start out as the directory path string itself;
        # the incremental mode of run() replaces asin_list with a real list.
        self.file_path_list = dir_path
        self.asin_list = dir_path
        self.engine = TemplatesMysql().engine

    def list_files(self, dir_path):
        """Walk ``dir_path`` recursively and describe every file found.

        Args:
            dir_path: Directory to walk.

        Returns:
            A DataFrame with columns ``asin`` (the file name with every
            ``.jpg`` occurrence removed) and ``local_path`` (the joined
            path of the file).
        """
        rows = []
        for dirpath, _dirnames, filenames in os.walk(dir_path):
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                asin = filename.replace(".jpg", "")
                rows.append([asin, file_path])
        df = pd.DataFrame(rows, columns=['asin', 'local_path'])
        print(df.shape)
        return df

    def save_data(self, df):
        """Append ``df`` to ``{site_name}_pictures_local_path{self_flag}``."""
        df.to_sql(
            f"{self.site_name}_pictures_local_path{self.self_flag}",
            con=self.engine,
            if_exists="append",
            index=False,
        )

    def run(self):
        """Index the images under ``self.dir_path`` into the database.

        With a non-empty ``self_flag`` the incremental mode runs and the
        process then exits via ``SystemExit``. Otherwise every entry listed
        at construction time is walked and stored.
        """
        if self.self_flag:
            self._run_incremental()
            # sys.exit() instead of the interactive-only quit() helper;
            # both raise SystemExit.
            sys.exit()
        self._run_full_walk()

    def _run_incremental(self):
        """Store only images whose asin is not yet present in the table."""
        sql = f"select * from {self.site_name}_pictures_local_path{self.self_flag};"
        df_read = pd.read_sql(sql, con=self.engine)
        print("df_read.shape:", df_read.shape)

        # Entries whose name contains ".jpg" are skipped; the remaining
        # ones are listed as sub-directories below.
        site_list = os.listdir(self.dir_path)
        site_list = [site for site in site_list if ".jpg" not in site]
        print(site_list)

        self.asin_list = []
        self.dir_path_list = []
        for site in site_list:
            print(f"{self.dir_path}/{site}")
            dir_list = os.listdir(f"{self.dir_path}/{site}")
            asin_list = [name.replace(".jpg", "") for name in dir_list]
            print("dir_list:", dir_list[:10])
            print("asin_list:", asin_list[:10])
            dir_path_list = [
                os.path.join(self.dir_path, f"{site}/", dir_name)
                for dir_name in dir_list
            ]
            self.asin_list.extend(asin_list)
            self.dir_path_list.extend(dir_path_list)

        df = pd.DataFrame({'asin': self.asin_list, 'local_path': self.dir_path_list})
        print(df.shape)
        print(df.head())
        df['platform'] = self.platform
        print("df.shape:", df.shape)
        # Keep only rows whose asin is absent from the rows already stored.
        df = df.loc[~df.asin.isin(df_read.asin)]
        print("df.shape:", df.shape)
        self.save_data(df=df)

    def _run_full_walk(self):
        """Walk every entry listed at construction time and store its files."""
        for dir_name in self.dir_path_list:
            # Retry the same entry until it is stored: on any error the
            # engine is rebuilt and the attempt repeats after 10 seconds.
            while True:
                try:
                    # os.path.join keeps the path valid whether or not
                    # self.dir_path ends with a separator.
                    dir_path = os.path.join(self.dir_path, dir_name)
                    print("当前存储的图片路径:", dir_path)
                    df = self.list_files(dir_path=dir_path)
                    self.save_data(df=df)
                    break
                except Exception as e:
                    print(e, traceback.format_exc())
                    self.engine = TemplatesMysql().engine
                    time.sleep(10)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <platform>")
    platform = sys.argv[1]  # argument 1: platform
    handle_obj = PicturesLocal(
        dir_path=f'/mnt/data/img_data/{platform}',
        self_flag="_self",
        platform=platform,
    )
    handle_obj.run()