def load_data(em0, em1, em2, em3, em4, em5, em6):
    """Split seven per-class sample lists into train/val/test sets and
    vectorize them for a sequence model.

    Each ``emN`` is the full list of samples for one class. Per class:
    the first 72 samples become the test set, the last 30 the validation
    set, and 258 samples drawn from the middle slice become training data.

    Returns:
        (X_train, X_test, Y_train, Y_test, X_val, Y_val, test_label)
        where X_* are float arrays of shape (n_samples, 1, max_features)
        (binary bag-of-words per sample) and Y_* are one-hot label arrays.

    NOTE(review): relies on module-level names `createVocablulary`,
    `createIndex`, `process`, `Tokenizer`, `np_utils`, `max_features`
    and `nb_classes` defined elsewhere in the file.
    """
    # First 72 samples of each class: test set.
    test = em0[:72] + em1[:72] + em2[:72] + em3[:72] + em4[:72] + \
        em5[:72] + em6[:72]
    # Last 30 samples of each class: validation set.
    val = em0[-30:] + em1[-30:] + em2[-30:] + em3[-30:] + em4[-30:] + \
        em5[-30:] + em6[-30:]

    # Class 2 is under-represented: oversample it up to 360 items so that
    # its middle slice (em2[72:-30]) contains the 258 samples needed below.
    # Work on a copy -- the original code appended to the caller's list in
    # place (and left an unused `em2_new` variable behind).
    em2 = list(em2)
    while len(em2) < 360:
        # choice is re-evaluated on the growing list, so previously
        # resampled items (except the trailing 30) may be picked again,
        # matching the original in-place loop's behaviour.
        em2.append(random.choice(em2[72:-30]))

    # 258 training samples per class, drawn without replacement from the
    # middle slice of each class list.
    train = random.sample(em0[72:-30], 258) + \
        random.sample(em1[72:-30], 258) + \
        random.sample(em2[72:-30], 258) + \
        random.sample(em3[72:-30], 258) + \
        random.sample(em4[72:-30], 258) + \
        random.sample(em5[72:-30], 258) + \
        random.sample(em6[72:-30], 258)

    # Vocabulary is built from train + val only; test-set words outside it
    # are handled by createIndex/process (presumably mapped to OOV --
    # TODO confirm against those helpers).
    words = createVocablulary(train + val)
    train_vec, train_label = createIndex(words, train)
    val_vec, val_label = createIndex(words, val)
    test_vec, test_label = createIndex(words, test)

    X_train = process(train_vec, nb_words=max_features)
    X_val = process(val_vec, nb_words=max_features)
    X_test = process(test_vec, nb_words=max_features)

    # Binary bag-of-words encoding over the max_features vocabulary.
    tokenizer = Tokenizer(nb_words=max_features)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_val = tokenizer.sequences_to_matrix(X_val, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')

    # Add a singleton time-step axis: (n_samples, 1, max_features).
    # Sizes are fixed by the 258/30/72-per-class split over 7 classes.
    X_train = np.reshape(X_train, (258 * 7, 1, max_features))
    X_val = np.reshape(X_val, (30 * 7, 1, max_features))
    X_test = np.reshape(X_test, (72 * 7, 1, max_features))

    # One-hot encode the integer class labels.
    Y_train = np_utils.to_categorical(train_label, nb_classes)
    Y_test = np_utils.to_categorical(test_label, nb_classes)
    Y_val = np_utils.to_categorical(val_label, nb_classes)

    return X_train, X_test, Y_train, Y_test, X_val, Y_val, test_label