TensorFlow and categorical variables

data-mining  neural-networks  tensorflow
2022-03-06 10:10:38

I am trying to use TensorFlow on a dataset that has some categorical variables. I have encoded them as dummies, but it looks like that is causing trouble, and TF complains that the dataset is not dense.
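
Roughly what I did for the dummy encoding looks like the sketch below (illustrative column names, not my exact code):

import pandas as pd

# Illustrative sketch of the dummy encoding ("Day" and "District" are
# stand-in column names, not my actual preprocessing code):
# pd.get_dummies creates one 0/1 column per category level.
raw = pd.DataFrame({"Day": ["Tuesday", "Saturday", "Thursday"],
                    "District": ["CENTRAL", "MISSION", "PARK"]})
dummies = pd.get_dummies(raw, columns=["Day", "District"])
print(dummies.head())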

Or is the cause of the error something else entirely?

I am trying to run a simple neural network model with one hidden layer, trained with stochastic gradient descent. The code was working when the inputs were numeric variables (digit images from MNIST).

Thanks

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
 in ()
     37     return(test_acc,round(l,5))
     38 
---> 39 define_batch(0.005)
     40 run_batch()

 in define_batch(beta)
     11                                       shape=(batch_size, num_var))
     12   tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
---> 13   tf_valid_dataset = tf.constant(valid_dataset)
     14   tf_test_dataset = tf.constant(test_dataset)
     15 

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/tensorflow/python/ops/constant_op.pyc in constant(value, dtype, shape, name)
    159   tensor_value = attr_value_pb2.AttrValue()
    160   tensor_value.tensor.CopyFrom(
--> 161       tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape))
    162   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
    163   const_tensor = g.create_op(

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/tensorflow/python/framework/tensor_util.pyc in make_tensor_proto(values, dtype, shape)
    320     nparray = np.array(values, dtype=np_dt)
    321     if list(nparray.shape) != _GetDenseDimensions(values):
--> 322       raise ValueError("Argument must be a dense tensor: %s" % values)
    323   # python/numpy default float type is float64. We prefer float32.
    324   if (nparray.dtype == np.float64) and dtype is None:

ValueError: Argument must be a dense tensor:         Tuesday  Wednesday  Thursday  Friday  Saturday  Sunday  CENTRAL  \
736114      0.0        0.0       0.0     0.0       1.0     0.0      0.0
437148      0.0        0.0       1.0     0.0       0.0     0.0      0.0
605041      0.0        0.0       0.0     0.0       0.0     0.0      0.0
444608      0.0        0.0       0.0     0.0       1.0     0.0      0.0
695549      0.0        0.0       0.0     0.0       1.0     0.0      0.0
662807      0.0        0.0       0.0     1.0       0.0     0.0      0.0
238635      0.0        0.0       0.0     0.0       0.0     1.0      0.0
549524      0.0        0.0       0.0     1.0       0.0     0.0      0.0
705478      1.0        0.0       0.0     0.0       0.0     0.0      0.0
557716      0.0        0.0       0.0     1.0       0.0     0.0      0.0
41808       0.0        0.0       0.0     0.0       0.0     1.0      0.0
227235      1.0        0.0       0.0     0.0       0.0     0.0      0.0
848719      0.0        0.0       0.0     0.0       0.0     0.0      0.0
731202      0.0        0.0       0.0     0.0       1.0     0.0      0.0
467516      1.0        0.0       0.0     0.0       0.0     0.0      1.0

Here is an excerpt of the code:

# Adding regularization to the 1 hidden layer network
def define_batch(beta):
    batch_size = 128
    num_RELU = 256
    graph1 = tf.Graph()
    with graph1.as_default():

      # Input data. For the training data, we use a placeholder that will be fed
      # at run time with a training minibatch.
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, num_var))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)

      # Variables.
      weights_RELU = tf.Variable(
        tf.truncated_normal([num_var, num_RELU]))
      biases_RELU = tf.Variable(tf.zeros([num_RELU]))
      weights_layer1 = tf.Variable(
        tf.truncated_normal([num_RELU, num_labels]))
      biases_layer1 = tf.Variable(tf.zeros([num_labels]))

      # Training computation.
      logits_RELU = tf.matmul(tf_train_dataset, weights_RELU) + biases_RELU
      RELU_vec = tf.nn.relu(logits_RELU)
      logits_layer = tf.matmul(RELU_vec, weights_layer1) + biases_layer1                  
      # loss = tf.reduce_mean(
      #        tf.nn.softmax_cross_entropy_with_logits(logits_layer, tf_train_labels))
      cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits_layer, tf_train_labels, name="cross_entropy")
      l2reg = tf.reduce_sum(tf.square(weights_RELU))+tf.reduce_sum(tf.square(weights_layer1))
      beta = 0.005
      loss = tf.reduce_mean(cross_entropy+beta*l2reg)

      # Optimizer.
      optimizer = tf.train.GradientDescentOptimizer(0.3).minimize(loss)

      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits_layer)
      valid_prediction = tf.nn.softmax(
        tf.matmul(tf.nn.relu((tf.matmul(tf_valid_dataset, weights_RELU) + biases_RELU)),weights_layer1)+biases_layer1)

      test_prediction = tf.nn.softmax(
        tf.matmul(tf.nn.relu((tf.matmul(tf_test_dataset, weights_RELU) + biases_RELU)),weights_layer1)+biases_layer1)

import datetime

startTime = datetime.datetime.now() 

num_steps = 301 # change to 3001

def run_batch():

    with tf.Session(graph=graph1) as session:

      tf.initialize_all_variables().run()
      print("Initialized")
      for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch. 
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions, logits = session.run(
          [optimizer, loss,train_prediction,logits_RELU], feed_dict=feed_dict)
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy(
            valid_prediction.eval(), valid_labels))
      test_acc = accuracy(test_prediction.eval(), test_labels)
      print("Test accuracy: %.1f%%" % test_acc)

      print('loss=%s' % l)
    x = datetime.datetime.now() - startTime
    print(x)
    return(test_acc,round(l,5))

define_batch(0.005)
run_batch()
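
The accuracy() helper called in run_batch is not part of the excerpt; it is the usual argmax comparison on one-hot arrays, roughly:

import numpy as np

# Sketch of the accuracy() helper called above (it is not in the excerpt):
# predictions and labels are assumed to be (batch_size, num_labels) numpy
# arrays, labels one-hot and predictions a softmax output.
def accuracy(predictions, labels):
    correct = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
    return 100.0 * correct / predictions.shape[0]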

Edit: @gdhal thanks for taking a look

train_dataset is a pandas DataFrame

train_dataset.columns
Index([u'Tuesday', u'Wednesday', u'Thursday', u'Friday', u'Saturday',
       u'Sunday', u'CENTRAL', u'INGLESIDE', u'MISSION', u'NORTHERN', u'PARK',
       u'RICHMOND', u'SOUTHERN', u'TARAVAL', u'TENDERLOIN', u' 3H - 4H',
       u' 5H - 6H', u' 7H - 8H', u' 9H - 10H', u'11H - 12H', u'13H - 14H',
       u'15H - 16H', u'17H - 18H', u'19H - 20H', u'21H - 22H', u'23H - 0H',
       u'Xnorm', u'Ynorm', u'Hournorm'],
      dtype='object')

All variables are dummies (taking values 0 or 1) except the last three (Xnorm, Ynorm, and Hournorm), which are numeric values normalized to the [0,1] interval. valid_dataset and test_dataset have the same format.
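
For what it's worth, this is one way I could get a plain dense float32 array out of these DataFrames (a sketch only; the code above passes the DataFrames to tf.constant directly):

import numpy as np

# Sketch only (not what the code above currently does): hand TensorFlow a
# plain dense float32 numpy array instead of the pandas DataFrame itself.
valid_array = valid_dataset.values.astype(np.float32)
test_array = test_dataset.values.astype(np.float32)
print(valid_array.shape, valid_array.dtype)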

train_labels is a pandas Series

train_labels.describe()

count            790184
unique               39
top       LARCENY/THEFT
freq             157434
Name: Category, dtype: object

valid_labels and test_labels have the same format
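
Since the placeholders expect (batch_size, num_labels) rows, the Category strings have to become one-hot columns at some point; a sketch of that step (my exact preprocessing is not shown above):

import numpy as np
import pandas as pd

# Sketch (my exact preprocessing is not shown above): turn the Category
# Series with 39 string levels into a dense (n, 39) float32 one-hot array,
# matching the (batch_size, num_labels) placeholder.
labels_onehot = pd.get_dummies(train_labels).values.astype(np.float32)
print(labels_onehot.shape)  # expected: (790184, 39)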
