1

0133d620 · abel_cjy · 18dd364d · 0133d620 · 0133d620 · 0133d620
Commit 0133d620 authored Feb 07, 2025 by abel_cjy
4 changed files
--- a/tensor_flow/text_classification_model.h5
+++ b/tensor_flow/text_classification_model.h5
--- a/tensor_flow/tf_dome1.py
+++ b/tensor_flow/tf_dome1.py
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import psycopg2
+import pandas as pd
+"""
+一个简单的文本分类模型，用于预测商品是否与圣诞节相关 
+文本：    ASIN、站点、标题、描述、卖点
+子文本（买家评论）：  
+图片/视频：ASIN、站点、首图、子图、A+、视频
+大数据模型训练流程 
+1.处理大量数据，使用 hadoop、spark 等计算框架来处理、分析、挖掘这些数据。  涉及到：存储、清洗、分析、等方面
+2.下面的案例更侧重于机器学习和深度学习的应用，以及使用TensorFlow和Keras构建文本分类模型
+"""
+#用于判断产品是否与圣诞相关。
+def is_christmas_product(row):
+    keywords = ['christmas', 'christmas tree', 'christmas gifts', 'christmas decorations', 'christmas celebration',
+                'santa claus', 'winter holiday', 'christmas shopping', 'holiday party']
+    title = str(row['title']).lower() if pd.notna(row['title']) else ''
+    describe = str(row['describe']).lower() if pd.notna(row['describe']) else ''
+    product_description = str(row['product_description']).lower() if pd.notna(row['product_description']) else ''
+    return any((keyword in title or keyword in describe or keyword in product_description) for keyword in keywords)
+#用于判断产品是否包含圣诞装饰相关关键词
+def has_christmas_decorations(row):
+    keywords = ['decorations', 'ornaments', 'festive decor', 'holiday decor', 'christmas adornments']
+    return any(keyword in str(row['title']).lower() or keyword in str(row['describe']).lower() or keyword in str(row['product_description']).lower() for keyword in keywords)
+# 定义判断条件的函数
+def is_condition_met(row):
+    return 'keyword' in str(row['title']) and 'keyword' in str(row['product_description']) and 'keyword' in str(row['describe'])
+# 连接 PostgreSQL 数据库
+connection = psycopg2.connect(
+    host="192.168.10.223",
+    port="5432",
+    database="selection",
+    user="postgres",
+    password="fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS"
+)
+# 从数据库读取数据
+query = "select  asin,title,product_description,describe  from us_asin_detail_month_2023_12  where date_info ='2023-12'  and spider_int = 1  and product_description !=''   limit 1000"
+df = pd.read_sql(query, connection)
+# 关闭数据库连接
+connection.close()
+# 过滤掉空数据：通过筛选，保留 title、product_description 或 describe 字段中不为空的数据。
+df = df[(df['title'].notna() & (df['title'] != '')) |
+        (df['product_description'].notna() & (df['product_description'] != '')) |
+        (df['describe'].notna() & (df['describe'] != ''))]
+# 将满足条件的数据标记为 True
+df['meets_condition'] = df.apply(is_condition_met, axis=1)
+#添加新的特征列
+df['is_christmas'] = df.apply(is_christmas_product, axis=1)
+#划分训练集和测试集
+train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+train_df['is_christmas'] = train_df.apply(is_christmas_product, axis=1).fillna(False)
+# 在数据集中添加特征列
+df['has_christmas_decorations'] = df.apply(has_christmas_decorations, axis=1)
+# 使用训练集的一部分作为测试集
+test_df = train_df.sample(frac=1, random_state=42)
+test_df['is_christmas'] = test_df.apply(lambda row: is_christmas_product(row) if pd.notna(row['title']) or pd.notna(row['product_description']) or pd.notna(row['describe']) else False, axis=1)
+# 合并文本字段
+train_texts = train_df['describe'] + ' ' + train_df['title'] + ' ' + train_df['product_description']
+test_texts = test_df['describe'] + ' ' + test_df['title'] + ' ' + test_df['product_description']
+train_texts = train_texts.fillna('')  # 将 NaN 值替换为空字符串
+train_texts = train_texts.dropna()
+test_texts = test_texts.astype(str)
+# 使用Tokenizer构建词汇表
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(train_texts)
+# 将文本序列转化为数字序列
+train_sequences = tokenizer.texts_to_sequences(train_texts)
+test_sequences = tokenizer.texts_to_sequences(test_texts)
+# 填充序列保证长度一致
+max_length = 150  # 你可以根据实际情况调整
+train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
+test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')
+#构建模型： 使用TensorFlow和Keras构建一个简单的深度学习文本分类模型
+#包括Embedding层:是一种用于将高纬度数据（如文本、图片、音频）映射到低维度空间的机器学习方法。这种模型通过创建一个由实数构成的多维向量来表示输入数据，使得每个数据点都对应于一个连续数值空间中的一个具体位置。
+# Embedding向量可以用于捕捉数据的结构化信息，尤其是在自然语言处理（NLP）和计算机视觉、等领域，它们帮助算法理解不同类型数据之间的相互关系
+# GlobalAveragePooling1D 层:
+# Dense层:是一个全连接层
+model = tf.keras.Sequential([
+    #input_dim： 表示词汇表中的单词数目，加 1 是因为词汇表的索引是从 1 开始的，而不是从 0。
+    #output_dim：表示嵌入向量的维度，即每个单词将被映射为一个具有维度的向量
+    #input_length：表示输入序列的长度。指定的文本序列的最大长度
+    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length),
+    #GlobalAveragePooling1D：用于将每个时间步的输出平均汇聚成一个固定长度的向量。在文本分类任务中，它通常用于处理变长序列，将不同长度的文本转换为固定长度的表示
+    tf.keras.layers.GlobalAveragePooling1D(),
+    #tf.keras.layers.Dense(128, activation='relu'),  # 调整神经元数量和激活函数
+    #是一个全连接层，其中包含 128 个神经元，并使用双曲正切（tanh）作为激活函数
+    tf.keras.layers.Dense(128, activation='tanh'),
+    #tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),  # 添加 L2 正则化
+    # Dense：这是神经网络的输出层，包含一个神经元。激活函数选择了 sigmoid。在二分类问题中，sigmoid 函数常用于输出层，因为它将输出值映射到 0 到 1 之间，可以看作是概率值，表示样本属于正类别的概率。
+    #这两层分开定义的目的是为了能够更灵活地定制神经网络的架构。如果需要对中间层进行调整，添加更多的隐藏层或者更改神经元数量，可以在这两层之间插入新的全连接层。最后一层的选择和设置是根据问题类型和模型的输出进行调整的
+    # 因为该案例：是一个二分类问题，因此使用了 sigmoid 激活函数。
+    tf.keras.layers.Dense(1, activation='sigmoid')
+])
+# 调整学习率
+optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
+# 编译模型: optimizer优化器， adam是一种常用的优化算法， 该算法就是综合其他优化算法的优点
+#loss（损失函数）： 用于衡量模型在训练过程中预测值与真实值之间的差异。在二分类问题中，一般使用 'binary_crossentropy' 作为损失函数。对于多分类问题，可能会选择 'categorical_crossentropy'。
+#metrics（评估指标）： 用于在训练和评估过程中监控模型性能的指标。在这里，我们使用 'accuracy'，表示模型在训练和评估过程中将准确率作为一个监测指标。
+model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
+# 训练模型
+model.fit(train_padded, train_df['is_christmas'], epochs=5, validation_data=(test_padded, test_df['is_christmas']))
+#保存模型结构和权重
+model.save('text_classification_model.h5')
+#使用模型进行预测
+predictions = model.predict(test_padded)
+#设置阈值
+threshold = 0.5
+# 存储匹配的 ASIN 和字段信息
+matching_asins_info = []
+# 遍历测试集中的每个样本
+for i in range(len(test_df)):
+    # 获取测试样本的信息
+    asin = test_df["asin"].iloc[i]
+    is_christmas_label = test_df["is_christmas"].iloc[i]
+    # 如果模型预测为与圣诞节相关
+    if predictions[i][0] > threshold:
+        print('asin:',asin)
+        # 记录匹配的 ASIN 和字段信息
+        matching_asins_info.append({
+            'ASIN': asin,
+            'IsChristmasLabel': is_christmas_label,
+            'PredictedLabel': 1,  # 1 表示预测为相关
+            'Probabilities': predictions[i],
+            'MatchingFields': {
+                'Title': test_df['title'].iloc[i],
+                'Description': test_df['describe'].iloc[i],
+                'ProductDescription': test_df['product_description'].iloc[i]
+            }
+        })
+output_file_path = 'E:/BaiduNetdiskDownload/选品大数据/推荐系统/matching_asins_info.txt'
+with open(output_file_path, 'w', encoding='utf-8') as output_file:
+    for info in matching_asins_info:
+        output_file.write(f"ASIN: {info['ASIN']}\n")
+        output_file.write(f"Is Christmas Label: {info['IsChristmasLabel']}\n")
+        matched_field = 'Title'  # 默认为 Title
+        if info['MatchingFields']['Description']:
+            matched_field = 'Description'
+        elif info['MatchingFields']['ProductDescription']:
+            matched_field = 'Product Description'
+        output_file.write(f"Matched Field: {matched_field}\n")
+        output_file.write(f"Matched Content: {info['MatchingFields'][matched_field]}\n")
+        output_file.write('-' * 50 + '\n')
--- a/tensor_flow/tf_dome2.py
+++ b/tensor_flow/tf_dome2.py
+import pandas as pd
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import psycopg2
+# 加载之前保存的模型
+from tensor_flow.tf_dome1 import tokenizer, max_length
+model = load_model('text_classification_model.h5')
+#连接 PostgreSQL 数据库
+connection = psycopg2.connect(
+    host="192.168.10.223",
+    port="5432",  # 替换成你的 PostgreSQL 端口
+    database="selection",
+    user="postgres",
+    password="fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS"
+)
+#从数据库读取数据
+query = "select  asin,title,product_description,describe  from us_asin_detail_month_2023_12  where date_info ='2023-12'  and spider_int = 2  and product_description !=''  limit " \
+        ""
+new_data = pd.read_sql(query, connection)
+#关闭数据库连接
+connection.close()
+#合并文本字段
+new_texts = new_data['describe'] + ' ' + new_data['title'] + ' ' + new_data['product_description']
+new_texts = new_texts.fillna('')
+new_texts = new_texts.astype(str)
+#使用之前的 Tokenizer 对象进行文本序列转化为数字序列，将新的文本数据转换为数字序列
+new_sequences = tokenizer.texts_to_sequences(new_texts)
+#将上一步得到的整数序列进行填充（padding）和截断（truncating），使得它们具有相同的长度
+#这样处理之后，new_padded 就是填充和截断后的数字序列，可以输入到神经网络模型中进行预测。这个过程保证了输入数据的一致性，使得所有的输入序列长度都是相同的，方便模型的处理。
+new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')
+# 使用模型进行新的预测
+new_predictions = model.predict(new_padded)
+# 设置阈值
+threshold = 0.5
+# 存储匹配的 ASIN 和字段信息
+matching_asins_info = []
+# 遍历新数据中的每个样本
+for i in range(len(new_data)):
+    # 获取样本信息
+    asin = new_data["asin"].iloc[i]
+    print('new_predictions[0]:',new_predictions[i][0])
+    # 如果模型预测为与圣诞节相关
+    if new_predictions[i][0] >= threshold:
+        print('asin:', asin)
+        # 记录匹配的 ASIN 和字段信息
+        matching_asins_info.append({
+            'ASIN': asin,
+            'Probabilities': new_predictions[i],
+            'MatchingFields': {
+                'Title': new_data['title'].iloc[i],
+                'Description': new_data['describe'].iloc[i],
+                'ProductDescription': new_data['product_description'].iloc[i]
+            }
+        })
+# 将匹配的信息保存到文件
+output_file_path = 'E:/BaiduNetdiskDownload/选品大数据/推荐系统/matching_asins_info2.csv'
+with open(output_file_path, 'w', encoding='utf-8') as output_file:
+    for info in matching_asins_info:
+        output_file.write(f"ASIN: {info['ASIN']}\n")
+        #output_file.write(f"Is Christmas Label: {info['IsChristmasLabel']}\n")
+        matched_field = 'Title'  # 默认为 Title
+        if info['MatchingFields'].get('Description'):
+            matched_field = 'Description'
+        elif info['MatchingFields'].get('ProductDescription'):
+            matched_field = 'Product Description'
+        output_file.write(f"Matched Field: {matched_field}\n")
+        # 检查键是否存在，如果存在则写入，否则跳过
+        matched_content = info['MatchingFields'].get(matched_field, 'N/A')
+        output_file.write(f"Matched Content: {matched_content}\n")
+        output_file.write('-' * 50 + '\n')
\ No newline at end of file
--- a/tensor_flow/tokenizer.pickle
+++ b/tensor_flow/tokenizer.pickle