 
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder


# Build a small example frame that mixes string features (city, title)
# with numeric features (expert_rating, user_rating).
X = pd.DataFrame(
    dict(
        city=['London', 'London', 'Paris', 'Sallisaw'],
        title=[
            "His Last Bow",
            "How Watson Learned the Trick",
            "A Moveable Feast",
            "The Grapes of Wrath",
        ],
        expert_rating=[5, 3, 4, 5],
        user_rating=[4, 5, 4, 3],
    )
)


# 1
# One-hot encode the "city" column with OneHotEncoder() and turn the words
# of the "title" column into a term-count matrix with CountVectorizer().
# Columns not listed here are dropped (remainder='drop').
column_trans1 = ColumnTransformer(
    transformers=[
        ('categories', OneHotEncoder(dtype='int'), ['city']),
        ('title_bow', CountVectorizer(), 'title'),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Fit the ColumnTransformer, then collect the output feature names and
# the transformed data before reporting them.
column_trans1.fit(X)
out_col_names = column_trans1.get_feature_names_out()
X_new = column_trans1.transform(X)

# Names of the generated output features.
print("转换后特征变量的名称：\n", out_col_names , "\n")

# Convert the transformed result with toarray() so it prints as a plain array.
print(X_new.toarray())
print("-" * 30, "\n")


# 2
# Pick columns (features) by name pattern and/or dtype using
# make_column_selector() instead of listing them explicitly.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector

# remainder is left at its default ('drop'), so columns matched by no
# selector are discarded.
column_trans2 = ColumnTransformer([
    # Standardize every numeric column.
    ('scale', StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    # One-hot encode object-dtype columns whose name matches "city".
    ('onehot', OneHotEncoder(),
     make_column_selector(pattern='city', dtype_include=object)),
])

# Fit and transform in a single pass so the transformer is fitted only once.
X_new = column_trans2.fit_transform(X)

# Names of the generated output features.
out_col_names = column_trans2.get_feature_names_out()
print("转换后特征变量的名称：\n", out_col_names , "\n")

# Transformed data.
print(X_new)
print("-"*37, "\n")


# 3
# With remainder="passthrough", the columns not claimed by any transformer
# are appended, untransformed, to the transformed output.
column_trans3 = ColumnTransformer(
    [('city_category', OneHotEncoder(dtype='int'), ['city']),
     ('title_bow', CountVectorizer(), 'title')],
    remainder='passthrough')

# Fit and transform in a single pass so the transformer is fitted only once.
X_new = column_trans3.fit_transform(X)

# Names of the generated output features.
out_col_names = column_trans3.get_feature_names_out()
print("转换后特征变量的名称：\n", out_col_names , "\n")

# Transformed data, including the passed-through columns.
print(X_new)
print("-"*45, "\n")


# 4
# When remainder is itself a transformer, the columns not claimed by any
# listed transformer are fed through that transformer.
from sklearn.preprocessing import MinMaxScaler

column_trans4 = ColumnTransformer(
    [('city_category', OneHotEncoder(), ['city']),
     ('title_bow', CountVectorizer(), 'title')],
    remainder=MinMaxScaler())

# Fit and transform in a single pass so the transformer is fitted only once.
# Only the last two output columns are kept: the remainder columns
# (expert_rating, user_rating) are placed after the listed transformers'
# outputs, so these are the MinMaxScaler-scaled ratings.
X_new = column_trans4.fit_transform(X)[:, -2:]

# Names of all generated output features (not just the two kept above).
out_col_names = column_trans4.get_feature_names_out()
print("转换后特征变量的名称：\n", out_col_names , "\n")

# The scaled remainder columns.
print(X_new)
print("-"*51)
 
