#注意:为了方便分析,建议把代码文件(.ipynb)和数据文件(xls,csv,txt等)放到同一个文件夹/目录
# 数据分析基本操作
#载入分析模块
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from mlxtend.frequent_patterns import apriori
from transformers import pipeline
# 一.多格式数据导入
import pandas as pd
# Read the CSV file.
data1 = pd.read_csv('上海餐饮数据.csv')
# Preview the loaded data (only rendered when this is the last expression
# of a notebook cell; a no-op in a plain script).
data1.head()
# Read the Excel file.
data2 = pd.read_excel('中医辨证.xlsx')
# Preview the loaded data.
data2.head()
# Read the TXT file. The context manager closes the handle as soon as the
# text has been read, instead of leaving it open for the rest of the session.
with open('news1.txt', encoding='utf-8') as data3:
    content = data3.read()
print(content)  # Print the full contents.
# 二.常见数据分析方法
# 2. Association rule analysis
def association_analysis(df, min_support=0.05):
    """Mine frequent itemsets from order lines with the Apriori algorithm.

    Args:
        df: DataFrame containing ``order_id``, ``product_name`` and
            ``quantity`` columns.
        min_support: Minimum support threshold passed to ``apriori``.

    Returns:
        The DataFrame returned by ``apriori`` (item names used as labels),
        sorted by ``support`` in descending order.
    """
    # One row per order, one column per product, cell = summed quantity.
    basket = df.groupby(['order_id', 'product_name'])['quantity'].sum().unstack()
    # Products missing from an order are NaN after unstack; treat them as 0
    # and one-hot encode as booleans (vectorized, replaces per-cell applymap).
    basket = basket.fillna(0) > 0
    frequent_items = apriori(basket, min_support=min_support, use_colnames=True)
    return frequent_items.sort_values('support', ascending=False)
# 3. Classification
def classification(X_train, y_train, random_state=None):
    """Fit a random forest classifier and return the fitted model.

    Args:
        X_train: Training features accepted by ``RandomForestClassifier.fit``.
        y_train: Training labels.
        random_state: Optional seed forwarded to the classifier so results
            can be reproduced; ``None`` keeps the library default.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    return clf
# 4. Cluster analysis
def clustering(X, n_clusters=4, random_state=None):
    """Standardize the features and assign each row to a K-Means cluster.

    Args:
        X: Feature matrix accepted by ``StandardScaler.fit_transform``.
        n_clusters: Number of clusters to form.
        random_state: Optional seed forwarded to ``KMeans`` so cluster
            assignments can be reproduced; ``None`` keeps the library default.

    Returns:
        The cluster labels returned by ``KMeans.fit_predict``, one per row
        of ``X``.
    """
    # Scale first so no single feature dominates the distance computation.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(X_scaled)
    return clusters
# 5. Neural network modelling
def neural_net(X_train, y_train, hidden_layer_sizes=(100, 50), max_iter=500,
               random_state=None):
    """Fit a multi-layer perceptron classifier and return the fitted model.

    Args:
        X_train: Training features accepted by ``MLPClassifier.fit``.
        y_train: Training labels.
        hidden_layer_sizes: Number of units in each hidden layer.
        max_iter: Maximum number of training iterations.
        random_state: Optional seed forwarded to the classifier so results
            can be reproduced; ``None`` keeps the library default.

    Returns:
        The fitted ``MLPClassifier``.
    """
    # NOTE(review): the features are passed through unscaled here, unlike
    # clustering(); confirm whether callers are expected to scale them first.
    mlp = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        random_state=random_state,
    )
    mlp.fit(X_train, y_train)
    return mlp
# 6. Representation learning
def representation_learning(texts, max_features=1000, n_components=50):
    """Turn texts into dense low-dimensional vectors via TF-IDF followed by PCA.

    Args:
        texts: Iterable of text documents accepted by
            ``TfidfVectorizer.fit_transform``.
        max_features: Upper bound on the TF-IDF vocabulary size.
        n_components: Requested number of PCA components. It is reduced
            automatically when the data has fewer samples or features than
            requested, because PCA rejects larger values.

    Returns:
        The array returned by ``PCA.fit_transform``, with one row per text
        and at most ``n_components`` columns.
    """
    # NOTE(review): the default TfidfVectorizer tokenizer does not segment
    # text without whitespace; if ``texts`` are unsegmented Chinese, they
    # presumably need word segmentation beforehand. Confirm with the caller.
    tfidf = TfidfVectorizer(max_features=max_features)
    dense_vectors = tfidf.fit_transform(texts).toarray()
    # PCA requires n_components <= min(n_samples, n_features); clamp so small
    # corpora do not raise ValueError.
    n_components = min(n_components, *dense_vectors.shape)
    pca = PCA(n_components=n_components)
    reduced_vectors = pca.fit_transform(dense_vectors)
    return reduced_vectors
# 7. Sentiment analysis
def sentiment_analysis(reviews, model="bert-base-chinese"):
    """Run a Hugging Face text-classification pipeline over reviews.

    Args:
        reviews: A single string, a list of strings, or a pandas Series of
            review texts.
        model: Name or path of the model loaded by the pipeline.

    Returns:
        A DataFrame built from the pipeline output (one row per review).
    """
    # NOTE(review): "bert-base-chinese" appears to be a base checkpoint
    # without a fine-tuned classification head, in which case the predicted
    # labels are not meaningful sentiment. Confirm, and pass a checkpoint
    # fine-tuned for sentiment via ``model`` if so.
    classifier = pipeline("text-classification", model=model)
    # The pipeline accepts str or list input; a Series must be converted.
    if isinstance(reviews, pd.Series):
        reviews = reviews.tolist()
    # Truncate reviews longer than the model's maximum sequence length
    # instead of failing on them.
    results = classifier(reviews, truncation=True)
    return pd.DataFrame(results)
# Main program
if __name__ == "__main__":
    # Load the data.
    # NOTE(review): load_data is not defined in the visible part of this
    # file; it must be defined or imported before this runs. Confirm.
    orders, products, reviews = load_data()
    # Association analysis example.
    frequent_items = association_analysis(orders)
    # Classification example (predict whether a customer will repurchase).
    X_class = products[['price', 'sales']]
    y_class = products['repurchase']
    clf_model = classification(X_class, y_class)
    # Clustering example (customer segmentation): per-user total price and
    # number of order lines.
    X_cluster = orders.groupby('user_id').agg({'price': 'sum', 'quantity': 'count'})
    user_clusters = clustering(X_cluster)
    # Neural network example (same features and labels as the classifier).
    nn_model = neural_net(X_class, y_class)
    # Representation learning example.
    product_vectors = representation_learning(products['description'])
    # Sentiment analysis example; limit the count to avoid memory problems.
    sentiment_results = sentiment_analysis(reviews[:100])

