Python Tokenizer.sequences_to_matrix Examples (keras.preprocessing.text.Tokenizer.sequences_to_matrix)
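Each example below comes from an open-source project and shows Tokenizer.sequences_to_matrix in context. The method turns a list of integer word-index sequences into a NumPy array of shape (len(sequences), num_words), with mode set to 'binary', 'count', 'tfidf', or 'freq'. As a quick orientation, here is a minimal sketch; the toy sentences and the num_words value are illustrative only and are not taken from any of the projects listed:

from keras.preprocessing.text import Tokenizer

# Toy corpus, for illustration only.
texts = ["the cat sat on the mat", "the dog ate my homework"]

tokenizer = Tokenizer(num_words=20)              # keep at most the 20 most frequent words
tokenizer.fit_on_texts(texts)                    # build the word -> index vocabulary
sequences = tokenizer.texts_to_sequences(texts)  # lists of integer word indices

# One row per sequence, one column per word index.
matrix = tokenizer.sequences_to_matrix(sequences, mode='binary')
print(matrix.shape)  # (2, 20)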

Example #1


def text_encode(train, val, test, type='onehot', maxlen=20):
    label_dict = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    train_label = train['CATEGORY']
    val_label = val['CATEGORY']
    test_label = test['CATEGORY']
    for (key, value) in label_dict.items():
        train_label = train_label.replace(key, value)
        val_label = val_label.replace(key, value)
        test_label = test_label.replace(key, value)
    train_label = to_categorical(train_label, num_classes=4)
    val_label = to_categorical(val_label, num_classes=4)
    test_label = to_categorical(test_label, num_classes=4)
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
    tokenizer.fit_on_texts(train['TITLE'])
    word_index = tokenizer.word_index
    vocab = tokenizer.word_index
    train_id = tokenizer.texts_to_sequences(train['TITLE'])
    val_id = tokenizer.texts_to_sequences(val['TITLE'])
    test_id = tokenizer.texts_to_sequences(test['TITLE'])
    if type == 'seq':
        train_id = pad_sequences(train_id, padding='post', maxlen=maxlen)
        val_id = pad_sequences(val_id, padding='post', maxlen=maxlen)
        test_id = pad_sequences(test_id, padding='post', maxlen=maxlen)
        return train_id, train_label, val_id, val_label, test_id, test_label, vocab, word_index
    else:
        train_onehot = tokenizer.sequences_to_matrix(train_id, mode='binary')
        val_onehot = tokenizer.sequences_to_matrix(val_id, mode='binary')
        test_onehot = tokenizer.sequences_to_matrix(test_id, mode='binary')
        return train_onehot, train_label, val_onehot, val_label, test_onehot, test_label, vocab, word_index

Example #2


def train_population(population):
    # Initialize the data set
    (X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=max_words)
    num_classes = np.max(y_train) + 1
    tokenizer = Tokenizer(num_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    # For graph purposes
    histories = []
    for neural_network in population:
        # Create the model
        keras_model = create_keras_model(neural_network, num_classes)
        print(neural_network)
        # Train it
        history = keras_model.fit(X_train, y_train, batch_size=128, epochs=20, verbose=2, validation_data=(X_test, y_test))
        # Score it
        score = keras_model.evaluate(X_test, y_test, verbose=0)
        if neural_network["accuracy"] == 0.:
            neural_network["accuracy"] = score[1]
        # Save it
        histories.append(history)
    return histories

Example #3


File: model_mlp.py | Project: manasRK/adv_ml_project

def mlp_model(X_train, y_train, X_test, y_test):
    tokenizer = Tokenizer(nb_words=1000)
    nb_classes = np.max(y_train) + 1
    X_train = tokenizer.sequences_to_matrix(X_train, mode="freq")
    X_test = tokenizer.sequences_to_matrix(X_test, mode="freq")
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print("Building model...")
    model = Sequential()
    model.add(Dense(512, input_shape=(max_len,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode='categorical')
    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
    model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
    # print('Test score:', score[0])
    # print('Test accuracy:', score[1])
    pred_labels = model.predict_classes(X_test)
    # print pred_labels
    # print y_test
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print precision, recall, f1, supp
    return accuracy, precision, recall, f1

Example #4


File: baseline_reuters.py | Project: liqiangq/COMP423

def running_retuter(modelname):
    maxlen = 400
    max_words = 10000
    # 1. Loading started
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    num_classes = np.max(y_train) + 1
    # 2. pad_sequences
    keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)
    if (modelname == 'cnn'):
        x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
        x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)
    elif (modelname == 'nn'):
        tokenizer = Tokenizer(num_words=max_words)
        x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
        x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)
    bulidModel(modelname, num_classes, x_test, y_test, x_train, y_train)

Example #5


def train(model, x_train, y_train, x_test, y_test):
    num_classes = np.max(y_train) + 1
    tokenizer = Tokenizer(num_words=1000)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    modelcheckpoint_callback = ModelCheckpoint("./best_reuters_model.h5", monitor='val_loss', mode='min', save_best_only=True, save_weights_only=True)
    history = model.fit(x_train, y_train, batch_size=32, epochs=5, verbose=1, validation_split=0.1, callbacks=[modelcheckpoint_callback])
    score = model.evaluate(x_test, y_test, batch_size=32, verbose=1)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

Example #6


def get_inputs(file_lst, split=0.2):
    all_tweets = []
    i = 0
    for file in file_lst:
        f = np.load(file)
        for t in f:
            all_tweets.append((t, i))
        i += 1
    shuffle(all_tweets)
    X_train, y_train = [], []
    X_test, y_test = [], []
    split_num = int(len(all_tweets) * split)
    for i in range(split_num):
        X_test.append(all_tweets[i][0])
        y_test.append(all_tweets[i][1])
    for i in range(split_num, len(all_tweets)):
        X_train.append(all_tweets[i][0])
        y_train.append(all_tweets[i][1])
    # tokenize data
    tokenizer = Tokenizer(num_words=5000)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    Y_train = np_utils.to_categorical(y_train, num_categories)
    Y_test = np_utils.to_categorical(y_test, num_categories)
    return (X_train, Y_train), (X_test, Y_test)

Example #7


File: imdb_ClassModel.py | Project: Peter-Chou/tensorflow-playground

def _build_data(self):
    """data preprocessing & graph input initialization

    args:
        train_dataset: tuple -- (x_train, y_train)
        test_dataset: tuple -- (x_test, y_test)
    """
    # one-hot encode
    tokenizer = Tokenizer(num_words=1000)
    self._x_train = tokenizer.sequences_to_matrix(self._x_train, mode="binary")
    self._y_train = keras.utils.to_categorical(self._y_train, self.output_dim)
    self._x_test = tokenizer.sequences_to_matrix(self._x_test, mode="binary")
    self._y_test = keras.utils.to_categorical(self._y_test, self.output_dim)
    self.data_num = self._x_train.shape[0]
    with tf.name_scope("init"):
        self.x = tf.placeholder(tf.float32, shape=(None, 1000), name="x")
        self.y = tf.placeholder(tf.float32, shape=(None, 2), name="y")
        self.global_step = tf.get_variable("global_step", trainable=False, initializer=tf.constant(0))

Example #8


def preprocess_features(x_train, x_test, max_words):
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return x_train, x_test

Example #9


def main():
    tweets = [['Trump is crazy'], ['trump is bitching all the asdasda in live'], ['Soccer is too slow'], ['Waste time in World Cup rum booze']]
    train_y = np.array([1, 1, 0, 0])
    train_x = [x[0] for x in tweets]
    tokenizer = Tokenizer(num_words=max_words)
    print(train_x)
    tokenizer.fit_on_texts(train_x)
    dictionary = tokenizer.word_index
    print("dictionary: ", dictionary)

    def convert_text_to_index_array(text):
        # one really important thing that `text_to_word_sequence` does
        # is make all texts the same length -- in this case, the length
        # of the longest text in the set.
        result = []
        for word in kpt.text_to_word_sequence(text):
            print("word: ", word)
            x = dictionary.get(word, 0)
            print("x: ", x)
            result.append(x)
        return result
        # return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

    allWordIndices = []
    for text in train_x:
        wordIndices = convert_text_to_index_array(text)
        allWordIndices.append(wordIndices)
    allWordIndices = np.asarray(allWordIndices)
    print("allWord 1: ", allWordIndices)
    train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    print("train_x", train_x)
    print("type x: ", type(train_x))
    print("type y: ", type(train_y))
    # Scikit-Learn
    clf = svm.SVC()
    clf.fit(train_x, train_y)
    pred_tweet = ['Trump is live asdasda tu eres juan', 'Trump is asdasda illary', 'Trump is slow Soccer asdasda']
    allWordIndices = []
    for text in pred_tweet:
        wordIndices = convert_text_to_index_array(text)
        allWordIndices.append(wordIndices)
    allWordIndices = np.asarray(allWordIndices)
    print("allWord 2: ", allWordIndices)
    pred_X = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    print("pred X: ", pred_X)
    P = clf.predict(pred_X)
    print("P: ", P)

Example #10


File: use_cuda_numba_jit_prange_parallel.py | Project: pannapat/slu-hpc

def prepare(maxlen, dataset_filename='./data/dataset.csv', use_bigram=False):
    # df = pd.read_csv('./data/dataset.csv')
    df = pd.read_csv(dataset_filename)
    X = df['NAME']
    y = df['NATIONALITY']
    num_classes = len(y.unique())
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, y, test_size=0.2, random_state=69)
    X_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=False, char_level=True, oov_token=None)
    y_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, char_level=False, oov_token=None)
    X_train = X_train_df.values.astype(str)  # Otherwise, there's an error when calling 'fit_on_texts' >> AttributeError: 'int' object has no attribute 'lower'
    X_test = X_test_df.values.astype(str)  # Otherwise, there's an error when calling 'fit_on_texts' >> AttributeError: 'int' object has no attribute 'lower'
    if use_bigram:
        X_train = bigrams(X_train)
    X_tokenizer.fit_on_texts(X_train)
    X_train = X_tokenizer.texts_to_sequences(X_train)
    X_test = X_tokenizer.texts_to_sequences(X_test)
    X_train = X_tokenizer.sequences_to_matrix(X_train, mode='tfidf')
    X_test = X_tokenizer.sequences_to_matrix(X_test, mode='tfidf')
    # encode from string labels to numerical labels
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train_df.values.astype(str))  # error without astype(str)
    y_test = label_encoder.transform(y_test_df.values.astype(str))
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    # pad character sequences to have the same length
    X_train = sequence.pad_sequences(X_train, padding="post", maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, padding="post", maxlen=maxlen)
    max_features = len(X_tokenizer.word_counts)
    return [X_train, y_train, X_test, y_test, max_features, num_classes]
Example #11

def tfidf_process_ci_feats_keras(data, train_data, test_data, num_feats):
    y = train_data['Score']
    tokenizer = Tokenizer(num_words=num_feats)
    tokenizer.fit_on_texts(data['cutted_Dis'])
    sequences = tokenizer.texts_to_sequences(train_data['cutted_Dis'])
    X = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
    sequences1 = tokenizer.texts_to_sequences(test_data['cutted_Dis'])
    test_hh = tokenizer.sequences_to_matrix(sequences1, mode='tfidf')
    print(X.shape)
    return X, test_hh, y

Example #12


File: classify.py | Project: eokulik/Text-Classification

def main():
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    print('# of Training Samples: {}'.format(len(x_train)))
    print('# of Test Samples: {}'.format(len(x_test)))
    num_classes = max(y_train) + 1
    print('# of Classes: {0}'.format(num_classes))
    max_words = 10000
    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='count')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print(x_train[0])
    print(len(x_train[0]))
    print(max(x_train[0]))
    print(y_train[0])
    print(len(y_train[0]))
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    # model.add(Activation('relu'))
    model.add(Activation('exponential'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.metrics_names)
    batch_size = 32
    epochs = 2
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

Example #13


File: basic_neural_network.py | Project: Obipls/LfD

def NNclassify(X_train, X_test, y_train, y_test, inputtype):
    classtype = "gender"
    max_words = 10000
    batch_size = 32
    nb_epoch = 20
    if inputtype == 'categorical':
        nb_epoch = 10
        classtype = "age"
    print('Loading data...')
    print(len(X_train), 'train instances')
    print(len(X_test), 'test instances')
    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'classes')
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    print('Building model...')
    model = Sequential()
    model.add(MaxoutDense(100, input_shape=(max_words,)))
    model.add(Dropout(0.7))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode=inputtype)
    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    prediction = model.predict(X_test, batch_size=batch_size, verbose=1)
    pred_classes = np.argmax(prediction, axis=1)
    print(Counter(pred_classes))
    results = open('results.txt', 'a')
    results.write("{} \t {} features \t {} epochs \t {} batch size \t {} accuracy \n".format(classtype, max_words, nb_epoch, batch_size, score[1]))
    results.close()
    return pred_classes

Example #14


def get_data(mode='one_hot'):
    """Load the training data; the source is a txt file delimited by ', '.

    PARA:
        filename: source data file
        mode: type of return value, either 'one_hot' or 'sequence'
    RETURN:
        the split training and test sets
    """
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import pandas as pd
    import numpy as np
    import json
    print("getting data......")
    columns = ['content', 'label']
    content, label = [], []
    with open('D:/instruments_generate/biLstmWithAttention/data/traffic/train.json', mode='r', encoding='utf8') as fp:
        for line in fp.readlines():
            try:
                data_dict = json.loads(line)
                content.append(data_dict['charge'] + data_dict['defense'] + data_dict['support'])
                label.append(seq2lab(data_dict['result']))
            except:
                pass
    label = to_categorical(np.array(label))
    MAX_LEN = 500
    train_data, test_data, train_label, test_label = train_test_split(content, label, test_size=0.1, random_state=42)
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
    tokenizer.fit_on_texts(content)
    vocab = tokenizer.word_index
    train_data_ids = tokenizer.texts_to_sequences(train_data)
    test_data_ids = tokenizer.texts_to_sequences(test_data)
    if mode == 'one_hot':
        train_data = tokenizer.sequences_to_matrix(train_data_ids, mode='binary')
        test_data = tokenizer.sequences_to_matrix(test_data_ids, mode='binary')
    elif mode == 'sequence':
        train_data = pad_sequences(train_data_ids, maxlen=MAX_LEN)
        test_data = pad_sequences(test_data_ids, maxlen=MAX_LEN)
    print("data getted")
    return train_data, test_data, train_label, test_label, vocab

Example #15


File: sentiment_main.py | Project: Sra1chandra/HandsonNeuralNetwork_keras

def bag_of_words():
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    num_classes = np.max(y_train) + 1
    print(num_classes, 'classes')
    max_words = 1000
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    train = tokenizer.sequences_to_matrix(x_train, mode='count')
    test = tokenizer.sequences_to_matrix(x_test, mode='count')
    print('x_train shape:', train.shape)
    print('x_test shape:', test.shape)
    classify(train, y_train, test, y_test)

Example #16


def quick_dtmize(train_text, test_text, vocab_limit, mode='count'):
    '''vectorize docs w keras Tokenizer API properly with one function call'''
    assert mode in ['binary', 'count', 'freq', 'tfidf'], 'supplied `mode` invalid!'
    tokenizer = Tokenizer(num_words=vocab_limit)
    tokenizer.fit_on_texts(train_text)
    train_intseqs = tokenizer.texts_to_sequences(train_text)
    test_intseqs = tokenizer.texts_to_sequences(test_text)
    train_x = tokenizer.sequences_to_matrix(train_intseqs, mode=mode)
    test_x = tokenizer.sequences_to_matrix(test_intseqs, mode=mode)
    return train_x, test_x, tokenizer.word_index

Example #17


def load_data(em0, em1, em2, em3, em4, em5, em6):
    # em0,em1,em2,em3,em4,em5,em6=Domain()
    # words=createVocablulary(em0+em1+em2+em3+em4+em5+em6)
    test = em0[:72] + em1[:72] + em2[:72] + em3[:72] + em4[:72] + em5[:72] + em6[:72]  # the first items of each class serve as test samples
    val = em0[-30:] + em1[-30:] + em2[-30:] + em3[-30:] + em4[-30:] + em5[-30:] + em6[-30:]  # validation set: the last 30 of each class
    em2_new = []
    for i in range(360 - len(em2)):
        em2.append(random.choice(em2[72:-30]))  # re-sampling, expand to 360
    train = random.sample(em0[72:-30], 258) + random.sample(em1[72:-30], 258) + random.sample(em2[72:-30], 258) + \
        random.sample(em3[72:-30], 258) + random.sample(em4[72:-30], 258) + random.sample(em5[72:-30], 258) + \
        random.sample(em6[72:-30], 258)
    words = createVocablulary(train + val)
    train_vec, train_label = createIndex(words, train)
    val_vec, val_label = createIndex(words, val)
    test_vec, test_label = createIndex(words, test)
    X_train = process(train_vec, nb_words=max_features)
    X_val = process(val_vec, nb_words=max_features)
    X_test = process(test_vec, nb_words=max_features)
    # X_train=np.array(X_train)
    # X_val=np.array(X_val)
    # X_test=np.array(X_test)
    print 'X_train:', len(X_train)
    print len(X_train[0])
    print X_train[0]
    # print X_train[0]
    # print len(X_test)
    tokenizer = Tokenizer(nb_words=max_features)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    print len(X_train[0])
    print X_train[0][:200]
    print X_train.shape
    X_val = tokenizer.sequences_to_matrix(X_val, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    X_train = np.reshape(X_train, (258 * 7, 1, max_features))
    X_val = np.reshape(X_val, (30 * 7, 1, max_features))
    X_test = np.reshape(X_test, (72 * 7, 1, max_features))
    Y_train = np_utils.to_categorical(train_label, nb_classes)
    Y_test = np_utils.to_categorical(test_label, nb_classes)
    Y_val = np_utils.to_categorical(val_label, nb_classes)
    return X_train, X_test, Y_train, Y_test, X_val, Y_val, test_label

Example #18


def TokenTestGen(parentpath, filename, encoding='gbk'):
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    dataGen = csvToTextGen(parentpath=parentpath, filename=filename, encoding=encoding)
    labelList, maxSegLen = csvToLabelAndDataMaxLen(parentpath=parentpath, filename=filename, encoding=encoding)
    # tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataGen)
    dataGenList = csvToTextGen(parentpath=parentpath, filename=filename, encoding=encoding)
    sequences = tokenizer.texts_to_sequences(dataGenList)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    if modelname == 'mlp':
        data = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(labelList, num_classes=LABEL_CLASS)
    # print("data:",data)
    # print("labels:",labels)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    return maxSegLen, word_index, labels, data

Example #19


File: dl_models.py | Project: sviatoplok/apis-core

def test_model(model, sent):
    K.clear_session()
    script_dir = os.path.dirname(os.path.realpath('__file__'))
    fileh = open(os.path.join(script_dir, 'data/nlp_models/{}_vocab.obj'.format(model)), 'rb')
    lst_orth, lst_orth_dict, lst_labels, lst_labels_dict, lst_zero_label, lst_labels_dhae2 = pickle.load(fileh)
    model = load_model(os.path.join(script_dir, 'data/nlp_models/{}.h5'.format(model)))
    result = []
    txt = nlp(sent)
    tokens_lst = []
    for ent in txt.ents:
        print(ent)
        tokens, lemmas, pos_tags, shapes = extract_verbs_from_entity(ent, lst_orth, lst_orth_dict, add=False)
        if len(tokens) > 0:
            tokens_lst.append(tokens)
    x_matrix2 = np.array(tokens_lst)
    print(x_matrix2)
    tokenizer = Tokenizer(num_words=len(lst_orth))
    x_matrix3 = tokenizer.sequences_to_matrix(tokens_lst, mode='binary')
    zz = model.predict(x_matrix3, batch_size=32, verbose=1)
    for idx1, z in enumerate(zz):
        for idx, x in enumerate(zz[idx1]):
            v_id = '-'
            for k in lst_labels_dict.keys():
                if lst_labels_dict[k] == idx:
                    v_id = VocabsBaseClass.objects.get(id=k).name
            result.append((str(txt.ents[idx1]), idx, v_id, x))
    return result

Example #20


def get_data(filename='D:/judgement_prediction/judgement_prediction/temp/data.txt', mode='one_hot'):
    """Load the training data from the given file; the source is a txt file delimited by ', '.

    PARA:
        filename: source data file
        mode: type of return value, either 'one_hot' or 'sequence'
    RETURN:
        the split training and test sets
    """
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import pandas as pd
    import numpy as np
    print("getting data......")
    columns = ['content', 'label']
    data = pd.read_csv(filename, encoding='utf-8', sep=', ', header=None, names=columns, engine='python')
    data.reindex(np.random.permutation(data.index))
    content = data['content']
    label = to_categorical(np.array(data['label']))
    MAX_LEN = 200
    train_data, test_data, train_label, test_label = train_test_split(content, label, test_size=0.1, random_state=42)
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
    tokenizer.fit_on_texts(content)
    vocab = tokenizer.word_index
    train_data_ids = tokenizer.texts_to_sequences(train_data)
    test_data_ids = tokenizer.texts_to_sequences(test_data)
    if mode == 'one_hot':
        train_data = tokenizer.sequences_to_matrix(train_data_ids, mode='binary')
        test_data = tokenizer.sequences_to_matrix(test_data_ids, mode='binary')
    elif mode == 'sequence':
        train_data = pad_sequences(train_data_ids, maxlen=MAX_LEN)
        test_data = pad_sequences(test_data_ids, maxlen=MAX_LEN)
    print("data getted")
    return train_data, test_data, train_label, test_label, vocab

Example #21


File: parameter_search.py | Project: mydp2017/RIDDLE

def preproc_for_sklearn(X, y, nb_features):
    try:
        tokenizer = Tokenizer(num_words=nb_features)
    except:
        tokenizer = Tokenizer(num_words=nb_features)
    X = tokenizer.sequences_to_matrix(X, mode='binary')
    return X, y

Example #22


File: parameter_search.py | Project: agoila/RIDDLE

def preproc_for_sklearn(X, y, nb_features):
    try:
        tokenizer = Tokenizer(num_words=nb_features)
    except:
        tokenizer = Tokenizer(num_words=nb_features)
    X = tokenizer.sequences_to_matrix(X, mode='binary')
    return X, y

Example #23


def load_vect_mat():
    """The main method for loading, vectorizing, matrix-forming the newswire (labeled train & test) data
    to be fed into the Keras functional model API .fit and .evaluate functions.

    Arguments
    ---------
    none

    Returns
    -------
    ttPair -- The usual pair of (X, Y)-train and that for test (tuple/pair of tuples/pairs)
    """
    print('\nLoading data...')
    (X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
    print(len(X_train), 'train sequences Be like:')
    print(X_train[0])
    print(len(X_test), 'test sequences Be like:')
    print(X_test[0])
    global nb_classes
    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'topic classes')
    print('\nVectorizing (1/0) sequence data...')
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('\nConvert the list of (integer) class labels to one hotshot! -- 1/0 "row-wise" topic matrix (for use with categorical_crossentropy)')
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    print(y_train[0], ' --> ', Y_train[0])
    print('... --> ...')
    print(y_train[-1], ' --> ', Y_train[-1])
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    ttPair = ((X_train, Y_train), (X_test, Y_test))
    return ttPair

Example #24


def runExperiment(xTrain, yTrain, xTest, yTest, outFile):
    numClasses = np.max(yTrain) + 1
    tokenizer = Tokenizer(num_words=MAXWORDS)
    xTrain = tokenizer.sequences_to_matrix(xTrain, mode='binary')
    xTest = tokenizer.sequences_to_matrix(xTest, mode='binary')
    yTrain = keras.utils.to_categorical(yTrain, numClasses)
    yTest = keras.utils.to_categorical(yTest, numClasses)
    model = Sequential()
    model.add(Dense(HIDDENLAYER1, input_shape=(MAXWORDS,)))
    model.add(Activation('relu'))
    model.add(Dense(HIDDENLAYER2))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(numClasses))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(xTrain, yTrain, batch_size=BATCHSIZE, epochs=EPOCHS, verbose=VERBOSE, validation_split=VALIDATIONSPLIT)
    predictions = model.predict(xTest, batch_size=BATCHSIZE, verbose=VERBOSE)
    labelsN = []
    predictionsN = []
    for i in range(0, len(predictions)):
        maxJ = -1
        maxP = 0
        for j in range(0, len(predictions[i])):
            if predictions[i][j] > maxP:
                maxP = predictions[i][j]
                maxJ = j
        maxYJ = -1
        maxY = 0
        for j in range(0, len(yTest[i])):
            if yTest[i][j] > maxY:
                maxY = yTest[i][j]
                maxYJ = j
        labelsN.append(maxJ)
        predictionsN.append(maxYJ)
        print(maxYJ, maxJ, file=outFile)
    score = metrics.accuracy_score(labelsN, predictionsN)
    return (score, labelsN, predictionsN)

Example #25


File: train_model.py | Project: venkattrj/keras-simple-sentiment-analysis

def train_model():
    max_words = 500
    data = pd.read_csv("data.csv", sep='\t', skipinitialspace=True)
    train_x = [x[1] for x in data.values[:1000]]
    # index all the sentiment labels
    train_y = np.asarray([x[0] for x in data.values[:1000]])
    tokenizer = Tokenizer(num_words=max_words)
    # feed tweets to the Tokenizer
    tokenizer.fit_on_texts(train_x)
    # Tokenizers come with a convenient list of words and IDs
    dictionary = tokenizer.word_index
    # Let's save this out so we can use it later
    with open('dictionary1.json', 'w') as dictionary_file:
        json.dump(dictionary, dictionary_file)
    allWordIndices = []
    # for each tweet, change each token to its ID in the Tokenizer's word_index
    for text in train_x:
        wordIndices = convert_text_to_index_array(text, dictionary)
        allWordIndices.append(wordIndices)
    # now we have a list of all tweets converted to index arrays.
    # cast as an array for future usage.
    allWordIndices = np.asarray(allWordIndices)
    # create one-hot matrices out of the indexed tweets
    train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    # treat the labels as categories
    train_y = keras.utils.to_categorical(train_y, 2)
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=32, epochs=5, verbose=1, validation_split=0.1, shuffle=True)
    model_json = model.to_json()
    with open('model1.json', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights('model1.h5')

Example #26


def processData():
    """Pre-process the Reuters data."""
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)
    # Tokenize the data
    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    # Convert class vector to binary class matrix
    num_classes = np.max(y_train) + 1
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    return x_train, y_train, x_test, y_test, num_classes

Example #27


File: data_helper.py | Project: dreasine/Data-Grand-Cup-Competition

def preprocess_keras():
    # split into train/test sets
    train = pd.read_csv("data/long_train.csv")
    new_train = train.rename(columns={'class': 'article_class'}, inplace=False)
    # y_train = pd.get_dummies(new_train['article_class'])
    y = new_train.article_class.values
    x_text = new_train.word_seg.values
    X_train, X_test, y_train, y_test = train_test_split(x_text, y, test_size=0.1, random_state=42)
    # encode the class labels, 10 classes in total
    y_train = pd.Series(y_train)
    y_test = pd.Series(y_test)
    y_labels = list(y_train.value_counts().index)
    le = pr.LabelEncoder()
    le.fit(y_labels)
    num_labels = len(y_labels)
    y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]), num_labels)
    y_test = to_categorical(y_test.map(lambda x: le.transform([x])[0]), num_labels)
    # tokenize and build the word-to-id dictionary
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
    tokenizer.fit_on_texts(x_text)
    vocab = tokenizer.word_index
    # replace each word with its id from the dictionary
    X_train_word_ids = tokenizer.texts_to_sequences(X_train)
    X_test_word_ids = tokenizer.texts_to_sequences(X_test)
    # One-hot
    x_train_o = tokenizer.sequences_to_matrix(X_train_word_ids, mode='binary')
    x_test_o = tokenizer.sequences_to_matrix(X_test_word_ids, mode='binary')
    # sequence mode
    x_train_p = pad_sequences(X_train_word_ids, maxlen=20)
    x_test_p = pad_sequences(X_test_word_ids, maxlen=20)
    return x_train_o, y_train, vocab, x_test_o, y_test

Example #28


def process(self, json_filename=None, h5_filename=None, plot=False, epochs=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        for r in csv_file:
            if i != 0:
                All.append(r)
            i = i + 1
    print("len(All): ", len(All))
    # randomly shuffle all tweets
    np.random.shuffle(All)
    ones_count = 0
    for r in All:
        tweet = r[0].strip()
        label = int(r[1])
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    print("X_all[0]: ", X_all[0])
    tokenizer = Tokenizer(num_words=max_words, oov_token='unk')
    print("Fitting data")
    tokenizer.fit_on_texts(X_all)
    X_Seq_All = tokenizer.texts_to_sequences(X_all)
    print("X_Seq_All[0]", X_Seq_All[0])
    print("Final Conversion")
    X_Train = tokenizer.sequences_to_matrix(X_Seq_All, mode='binary')
    print("train_x[0]", X_Train[0])
    Y_Train = Y_all
    print("Create Model")
    model = Sequential()
    model.add(Dense(1, input_dim=10000))
    model.add(Activation('sigmoid'))
    model.summary()
    print("Compilation")
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_Train, Y_Train, epochs=epochs, validation_split=0.20)
    print("Done")

Example #29


File: classify_example.py | Project: thepsiwa/machine-learning

def preprocessing(X_train, X_test, Y_train, Y_test, num_classes):
    print('Before convert of sequence words to binary matrix...')
    print('X_train shape:', np.shape(X_train))
    print('X_test shape:', np.shape(X_test))
    print('Convert sequences of words (index) to binary matrix')
    tokenizer = Tokenizer(num_words=MAX_WORDS)
    # Return: numpy array of shape (len(sequences), num_words).
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Convert class label (integers vector) to binary class matrix')
    Y_train = keras.utils.to_categorical(Y_train, num_classes)
    Y_test = keras.utils.to_categorical(Y_test, num_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    return X_train, X_test, Y_train, Y_test

Example #30


def get_reuters_dataset(batch_size, max_words):
    (X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
    nb_classes = np.max(y_train) + 1
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)
    batch_iterator = SimpleBatchIterator(X_train, y_train, batch_size, autoloop=True)
    test_batch_iterator = SimpleBatchIterator(X_test, y_test, len(X_test), autoloop=True)
    return batch_iterator, test_batch_iterator, nb_classes

Example #31


File: dataRelate.py | Project: guoxiaobo96/judgement_prediction

def __split_data(self, mode, MAX_LEN):
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    content, label = self.__read_data()
    train_data, test_data, train_label, test_label = train_test_split(content, label, test_size=0.1, random_state=42)
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
    tokenizer.fit_on_texts(content)
    vocab = tokenizer.word_index
    train_data_ids = tokenizer.texts_to_sequences(train_data)
    test_data_ids = tokenizer.texts_to_sequences(test_data)
    if mode == 'one_hot':
        train_data = tokenizer.sequences_to_matrix(train_data_ids, mode='binary')
        test_data = tokenizer.sequences_to_matrix(test_data_ids, mode='binary')
    elif mode == 'sequence':
        train_data = pad_sequences(train_data_ids, maxlen=MAX_LEN)
        test_data = pad_sequences(test_data_ids, maxlen=MAX_LEN)
    return train_data, test_data, train_label, test_label, vocab

Example #32


def preprocess(self, X_train, y_train, X_val, y_val):
    X_train_headline, X_train_article = X_train
    X_val_headline, X_val_article = X_val
    if self.get_tokenizer() is None:
        tokenizer = Tokenizer(num_words=self.config['vocabulary_dim'])
        self.set_tokenizer(tokenizer)
    tokenizer = self.get_tokenizer()
    tokenizer.fit_on_texts(X_train_headline + X_train_article)
    X_train_headline = tokenizer.texts_to_sequences(X_train_headline)
    X_train_article = tokenizer.texts_to_sequences(X_train_article)
    X_val_headline = tokenizer.texts_to_sequences(X_val_headline)
    X_val_article = tokenizer.texts_to_sequences(X_val_article)
    X_train_headline = tokenizer.sequences_to_matrix(X_train_headline, mode=self.config['matrix_mode'])
    X_train_article = tokenizer.sequences_to_matrix(X_train_article, mode=self.config['matrix_mode'])
    X_val_headline = tokenizer.sequences_to_matrix(X_val_headline, mode=self.config['matrix_mode'])
    X_val_article = tokenizer.sequences_to_matrix(X_val_article, mode=self.config['matrix_mode'])
    y_train_stance = np_utils.to_categorical(y_train)
    y_train_related = np_utils.to_categorical(collapse_stances(y_train))
    y_val_stance = np_utils.to_categorical(y_val)
    y_val_related = np_utils.to_categorical(collapse_stances(y_val))
    return ({
        'headline_input': X_train_headline,
        'article_input': X_train_article,
    }, {
        'related_prediction': y_train_related,
        'stance_prediction': y_train_stance,
    }, {
        'headline_input': X_val_headline,
        'article_input': X_val_article,
    }, {
        'related_prediction': y_val_related,
        'stance_prediction': y_val_stance,
    })

Example #33


File: conversion-prediction.py | Project: brandonassing/capstone

def tokenize(dic, data):
    # create a tokenizer and feed in word index
    t = Tokenizer(num_words=None, lower=True, split=' ')
    t.word_index = dic
    # convert words from each call transcription into an index array
    allWords = []
    transcriptions = data['Words']
    for text in transcriptions:
        words = convert_text_to_index_array(text, dic)
        allWords.append(words)
    # convert index array into a matrix and return it
    return t.sequences_to_matrix(allWords, mode='binary')

Example #34


File: stanford_sentiment_treebank.py | Project: tttthomasssss/hackathon

def run_keras_example():
    max_words = 1000
    batch_size = 32
    nb_epoch = 5
    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'classes')
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    print('Building model...')
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

Example #35


File: reuters_mlp.py | Project: GingerHugo/keras

'''
max_words = 10000
batch_size = 16

print "Loading data..."
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
print len(X_train), 'train sequences'
print len(X_test), 'test sequences'

nb_classes = np.max(y_train) + 1
print nb_classes, 'classes'

print "Vectorizing sequence data..."
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode="binary")
X_test = tokenizer.sequences_to_matrix(X_test, mode="binary")
print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape

print "Convert class vector to binary class matrix (for use with categorical_crossentropy)"
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print 'Y_train shape:', Y_train.shape
print 'Y_test shape:', Y_test.shape

print "Building model..."
model = Sequential()
model.add(Dense(max_words, 256, init='normal'))
model.add(Activation('relu'))
model.add(BatchNormalization(input_shape=(256,)))  # try without batch normalization (doesn't work as well!)

Example #36


File: makeModel.py | Project: Anchal-kansal/sentiment-analysis

    # is make all texts the same length -- in this case, the length
    # of the longest text in the set.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)
# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)
# create one-hot matrices out of the indexed tweets
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
# treat the labels as categories
train_y = keras.utils.to_categorical(train_y, 2)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',

Example #37


File: keras_jupyter.py | Project: oliverlewis/datasciencecoursera

print(y_train_cat.shape, y_test_cat.shape)

# In[8]:
nb_classes = np.max(encoded_Y_train) + 1
print(nb_classes, 'classes')

# In[9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_num = tokenizer.texts_to_sequences(X_train)
X_test_num = tokenizer.texts_to_sequences(X_test)
X_train_mat = tokenizer.sequences_to_matrix(X_train_num)
X_test_mat = tokenizer.sequences_to_matrix(X_test_num)

# In[10]:
print('X_train shape:', X_train_mat.shape)
print('X_test shape:', X_test_mat.shape)

# In[36]:
batch_size = 100
nb_epoch = 50

Example #38


File: loadModel.py | Project: Anchal-kansal/sentiment-analysis

    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." % (word))
    return wordIndices

# read in your saved model structure
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
# and create a model from that
model = model_from_json(loaded_model_json)
# and weight your nodes with your saved values
model.load_weights('model.h5')

# okay here's the interactive part
while 1:
    evalSentence = raw_input('Input a sentence to be evaluated, or Enter to quit: ')
    if len(evalSentence) == 0:
        break
    # format your input for the neural net
    testArr = convert_text_to_index_array(evalSentence)
    input = tokenizer.sequences_to_matrix([testArr], mode='binary')
    # predict which bucket your input belongs in
    pred = model.predict(input)
    # and print it for the humons
    print("%s sentiment; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))

Example #39


File: stanford_sentiment_treebank.py | Project: tttthomasssss/hackathon

y_train, y_valid, y_test = data[1], data[3], data[5]
# X_train, X_valid, X_test = data[0], data[2], data[4]
vec = CountVectorizer()
X_train = vec.fit_transform([' '.join(l) for l in data[0]])
X_valid = vec.transform([' '.join(l) for l in data[2]])
X_test = vec.transform([' '.join(l) for l in data[4]])

tokenizer = Tokenizer()
tokenizer.fit_on_texts([' '.join(l) for l in data[0]])
X_train_keras = tokenizer.texts_to_sequences([' '.join(l) for l in data[0]])
X_test_keras = tokenizer.texts_to_sequences([' '.join(l) for l in data[4]])
X_valid_keras = tokenizer.texts_to_sequences([' '.join(l) for l in data[2]])
X_train_keras = tokenizer.sequences_to_matrix(X_train_keras)
X_test_keras = tokenizer.sequences_to_matrix(X_test_keras)
X_valid_keras = tokenizer.sequences_to_matrix(X_valid_keras)

n_classes = np.max(y_train) + 1
Y_train = np_utils.to_categorical(y_train, n_classes)
Y_test = np_utils.to_categorical(y_test, n_classes)
Y_valid = np_utils.to_categorical(y_valid, n_classes)

print('KERAS...')

### MLP
model = Sequential()
model.add(Dense(output_dim=2048, input_dim=X_test_keras.shape[1], init='glorot_normal', W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
model.add(Activation('tanh'))
model.add(Dense(output_dim=256, input_dim=2048, init='glorot_normal', W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))

Example #40


File: lstm_classifier.py | Project: rolfkuipers/LTP

print('Fitting text on tokenizer...')
tokenizer = Tokenizer(nb_words=max_words)
tokenizer.fit_on_texts(X)

# Split the data
print('Split text into train and test...')
split_point = int(len(X) * 0.90)
X_train, X_test = X[:split_point], X[split_point:]
y_train, y_test = y[:split_point], y[split_point:]

print('Text to sequence - sequence to matrix for data ...')
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = tokenizer.sequences_to_matrix(X_train)
X_test = tokenizer.sequences_to_matrix(X_test)

nb_classes = np.max(y_train) + 1
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)

# Pad input sequences
input_size = len(max(X_train, key=len))
X_train = sequence.pad_sequences(X_train, maxlen=input_size)
X_test = sequence.pad_sequences(X_test, maxlen=input_size)

# Setting some parameters
batch_size = 20

Example #41


File: LSTM_RNN.py | Project: rodriggs/box-office

end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

# run simple linear regression to compare performance
# based on grid search done by:
# https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb
# the tfidf vectors capture co-occurance statistics, think of each number representing how many times
# a word occured in a text and scaled by word frequency
tfidfTokenizer = Tokenizer(nb_words=max_features)
tfidfTokenizer.fit_on_sequences(X_train.tolist())
X_train_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_train.tolist(), mode="tfidf"))
X_test_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_test.tolist(), mode="tfidf"))

# check tfidf matrix
print(X_train_tfidf)
print(X_train_tfidf.shape, X_test_tfidf.shape)

from sklearn.linear_model import LogisticRegression
model_tfidf_reg = LogisticRegression(random_state=0, C=0.001, penalty='l2', verbose=1)
model_tfidf_reg.fit(X_train_tfidf, y_train)

from sklearn.metrics import accuracy_score
# calculate test and train accuracy
print("train acc:", accuracy_score(y_test, model_tfidf_reg.predict(X_train_tfidf)))
print("test acc:", accuracy_score(y_test, model_tfidf_reg.predict(X_test_tfidf)))

Example #42


File: MLP_Movie_model_code.py | Project: dkyol/deepLearning-

# In[22]:
print(x_train[0])
print(y_train[0])

# ## 3. One-hot encoding the output
# Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector
# contains the number 14, then in the processed vector, the 14th entry will be 1.

# In[23]:
# One-hot encoding the output into vector mode, each of length 1000
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train[0])

# And we'll also one-hot encode the output.

# In[24]:
# One-hot encoding the output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(y_train.shape)
print(y_test.shape)

Example #43


File: deep_mlp.py | Project: agoila/RIDDLE

def process_X_data(X, nb_features):
    assert nb_features > 0
    tokenizer = Tokenizer(num_words=nb_features)
    return tokenizer.sequences_to_matrix(X, mode='binary')
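Many of the examples above come from Keras 1.x-era projects: they pass nb_words instead of num_words to Tokenizer and to reuters.load_data, call fit/evaluate with nb_epoch and show_accuracy, and import to_categorical from np_utils. In Keras 2 those arguments were renamed or removed. A minimal sketch of the current spelling, assuming tf.keras and toy index sequences in place of real data:

from tensorflow.keras.preprocessing.text import Tokenizer

X = [[1, 2, 3], [2, 4, 4]]           # toy integer index sequences, for illustration only
tokenizer = Tokenizer(num_words=10)  # older examples above spell this nb_words
X_binary = tokenizer.sequences_to_matrix(X, mode='binary')
print(X_binary.shape)                # (2, 10)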