
Hello, I have a question about neural networks. I have a model for text classification (positive vs. negative reviews). Its output accuracy is below 100%, which means there are texts the network classifies incorrectly. I am trying to print out exactly those texts, but I get an error. Please advise how to fix it.

import re
import itertools
import operator
from string import punctuation

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

from keras import optimizers
from keras.datasets import imdb
from keras.layers import Conv1D, Dense, Dropout, Embedding, Flatten
from keras.models import Sequential
from keras.preprocessing import sequence

pos_train_data = pd.read_csv('train_pos.tsv', sep='\t')
neg_train_data = pd.read_csv('train_neg.tsv', sep='\t')
pos_test_data = pd.read_csv('test_pos.tsv', sep='\t')
neg_test_data = pd.read_csv('test_neg.tsv', sep='\t')
 
pos_train_data = pos_train_data[['Text','Sentiment']]
neg_train_data = neg_train_data[['Text','Sentiment']]
pos_test_data = pos_test_data[['Text','Sentiment']]
neg_test_data = neg_test_data[['Text','Sentiment']]
 
 
data_train = pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
#print(data_train.head())
 
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
#print(data_test.head())
 
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)
 
def textclean(text):
    #tokens = word_tokenize(text)
    tokens = text.lower().split()
    # strip punctuation before the isalpha() filter, otherwise isalpha()
    # discards every token the translation table was meant to clean up
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
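# Quick sanity check of textclean on a made-up sentence (illustrative only;
# the exact output depends on the NLTK stop-word list):
print(textclean("The plot was brilliant and the actors were amazing!"))
# -> ['plot', 'brilliant', 'actors', 'amazing']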
 
 
# alternative cleaner based on BeautifulSoup (defined here but unused below)
def review_to_words(text):
    clean_text = BeautifulSoup(text, "html5lib").get_text()
    clean_text = re.sub(r"[^a-zA-Z]", " ", clean_text)
    words = clean_text.lower().split()
    # reuse the precomputed set: stopwords.words() rebuilds the list on every call
    words = [w for w in words if w not in stop_words]
    return words
 
 
reviews = []
for index, row in data_train.iterrows():
    reviews.append(textclean(row['Text']))  # textclean lower-cases internally
print(reviews[0])
 
linked_reviews = list(itertools.chain.from_iterable(reviews))
vocab_freq = dict()
#print(linked_reviews[1])
 
for word in linked_reviews:
    if word not in vocab_freq:
        vocab_freq[word] = 1
    else:
        vocab_freq[word] += 1
sorted_vocab_freq = sorted(vocab_freq.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_vocab_freq)
print(len(sorted_vocab_freq))
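# The same frequency table can be built with collections.Counter;
# most_common() already sorts by descending count (shown only as a
# cross-check here, vocab_freq above stays the source of truth):
from collections import Counter
print(Counter(linked_reviews).most_common(10))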
 
 
TOTAL_VOCAB = 5000
 
word_to_id = dict()
id_to_word = dict()
for i in range(TOTAL_VOCAB):
    word_to_id[sorted_vocab_freq[i][0]] = i
    id_to_word[i] = sorted_vocab_freq[i][0]
print(id_to_word[0])
 
#review_lengths
review_lengths = pd.DataFrame([len(review) for review in reviews])
review_lengths.columns = ['Len']
print(review_lengths)
#stats
print(review_lengths.describe())
 
 
def convert(l):
    # map words to vocabulary ids, silently dropping out-of-vocabulary words
    new_l = []
    for word in l:
        if word in word_to_id:
            new_l.append(word_to_id[word])
    return new_l
#print(len(data_train['Sentiment']))
 
 
X_train = []
y_train = []
 
#Tukey's method for cutting off outlier review lengths
first_q = review_lengths.Len.quantile(0.25)
third_q = review_lengths.Len.quantile(0.75)

upper_threshold = third_q + 1.5*(third_q - first_q)
lower_threshold = first_q - 1.5*(third_q - first_q)  # computed but not applied below
 
print(upper_threshold,lower_threshold)
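# Quick check of the same rule on toy data (hypothetical lengths):
toy = pd.Series([10, 12, 14, 200])
q1, q3 = toy.quantile(0.25), toy.quantile(0.75)
print(q3 + 1.5 * (q3 - q1))  # 134.0, so a 200-token review would be dropped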
 
 
 
for i in range(len(data_train)):
    converted_review = convert(reviews[i])
    if len(converted_review) <= upper_threshold:
        X_train.append(converted_review)
        y_train.append(data_train['Sentiment'][i])
 
 
# X_train is still a ragged list of id sequences at this point; pad_sequences
# below turns it into a rectangular ndarray (np.array on a ragged list is unreliable)
y_train = np.array(y_train)
print(y_train)
 
X_train = sequence.pad_sequences(X_train, maxlen=int(upper_threshold),value = 0)
print(X_train.shape,y_train.shape)
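# pad_sequences left-pads short sequences with `value` and, by default,
# truncates long ones from the front, e.g. (illustrative):
print(sequence.pad_sequences([[5, 6], [1, 2, 3, 4]], maxlen=3, value=0))
# [[0 5 6]
#  [2 3 4]]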
 
 
 
 
# NB: data_test (already built above) is rebuilt here, and pos_test_data
# is concatenated twice, which skews the validation set towards positives
data_test = pd.concat([pos_test_data, pos_test_data, neg_test_data], ignore_index=True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)
print(data_test)
print(pos_test_data)
validation_reviews = []
for index, row in data_test.iterrows():
    validation_reviews.append(textclean(row['Text']))
 
X_val = []
y_val = []
for i in range(len(data_test)):
    converted_review = convert(validation_reviews[i])
    if len(converted_review) <= upper_threshold:
        X_val.append(converted_review)
        y_val.append(data_test['Sentiment'][i])
# as with X_train, let pad_sequences turn the ragged list into an ndarray
X_val = sequence.pad_sequences(X_val, maxlen=int(upper_threshold), value=0)
print(X_val)
y_val = np.array(y_val)
#print(X_train)
#print(y_train)
 
 
 
 
EMBEDDING_LEN = 32
 
model = Sequential()
model.add(Embedding(TOTAL_VOCAB,EMBEDDING_LEN,input_length = int(upper_threshold)))
model.add(Conv1D(128,3,padding = 'same'))
model.add(Conv1D(64,3,padding = 'same'))
model.add(Conv1D(32,2,padding = 'same'))
model.add(Conv1D(16,2,padding = 'same'))
model.add(Flatten())
model.add(Dropout(0.25))
model.add(Dense(100,activation = 'relu'))
model.add(Dropout(0.25))
model.add(Dense(1,activation='sigmoid'))
model.summary()
opt = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'binary_crossentropy',optimizer = opt ,metrics = ['accuracy'])
model.fit(X_train,y_train,validation_data = (X_val,y_val),epochs = 1 ,batch_size = 1000)
 
 
y_pred_vect = model.predict(X_val)
# boolean mask
mask = (y_pred_vect != y_val).any(axis=1)
print(mask)
print(len(mask))
num_words = 1000  # only use the top 1000 words
INDEX_FROM = 3    # word index offset
# this step is meant to recover `test_x` in its original form (before tokenization):
(train_x, _), (test_x, _) = imdb.load_data(num_words=num_words, index_from=INDEX_FROM)
x_wrong = test_x[mask]
 
word_to_id = imdb.get_word_index()
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
 
id_to_word = {value:key for key,value in word_to_id.items()}
all_wrong_sents = [' '.join(id_to_word[i] for i in sent) for sent in x_wrong]

The block of code for printing out the texts is at the end of the program.
The error itself, at x_wrong = test_x[mask]:
IndexError: boolean index did not match indexed array along dimension 0; dimension is 25000 but corresponding boolean dimension is 7242
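For reference, a minimal sketch of one way the misclassified reviews could be pulled out (an assumption about the intent, not a confirmed fix). Two things look off: mask is built over X_val (7242 rows after the frac=0.3 sample and the length filter), while test_x comes from imdb.load_data, i.e. the separate Keras IMDB corpus with 25000 reviews, so the dimensions can never match; and y_pred_vect holds sigmoid probabilities, which are never exactly equal to the 0/1 labels, so the comparison needs a threshold. Indexing X_val itself and decoding it with the id_to_word built earlier from sorted_vocab_freq (before it is overwritten by the IMDB index above) avoids both problems:

# probabilities -> hard 0/1 labels, then a 1-D mask aligned with X_val
y_pred = (model.predict(X_val) > 0.5).astype('int32').ravel()
mask = y_pred != y_val                 # shape (len(X_val),)
x_wrong = X_val[mask]                  # padded id sequences of the misclassified reviews
# decode with this script's own id_to_word; note that id 0 doubles as both
# the padding value and the most frequent word in this encoding, so skipping
# zeros is only an approximation
wrong_texts = [' '.join(id_to_word[i] for i in row if i != 0) for row in x_wrong]
print(wrong_texts[:3])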

