# Preprocess the raw SMS spam CSV into padded token sequences (features)
# and one-hot labels (targets), saved as .npy files plus a word-index JSON.
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
# Load only the two columns of interest: v1 = class label, v2 = message text.
dataframe = pd.read_csv(
    'raw_data/data.csv',
    encoding='iso8859',
    usecols=['v1', 'v2'],
)
# Binary targets: 'ham' -> 0, anything else (spam) -> 1 (see column v1).
labels = np.array([0 if line == 'ham' else 1 for line in dataframe.v1])

# Raw message strings from column v2.
texts = np.array(list(dataframe.v2))

# Longest message, measured in whitespace-separated tokens; used as the
# padding length for the sequences below.
# NOTE(review): the Keras Tokenizer applies its own filtering/lowercasing,
# so its token counts can differ slightly from a plain split() — confirm
# this length is still a safe upper bound for the padded sequences.
maxlen = max(len(text.split()) for text in texts)
# Build a word -> integer-id vocabulary over the whole corpus, then encode
# every message as a fixed-length sequence of word ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Features: id sequences padded/truncated to maxlen.
X = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen)

# Targets: two-class one-hot vectors (ham -> class 0, spam -> class 1).
Y = tf.keras.utils.to_categorical(labels, num_classes=2)

print(X.shape)
print(Y.shape)
print('MESSAGE MAXLEN = {}'.format(maxlen))
# Hold out 40% of the samples for evaluation.
# NOTE(review): no random_state is set, so the split differs between runs —
# confirm reproducibility is not required here.
train_features, test_features, train_labels, test_labels = train_test_split(X, Y, test_size=0.4)

output_path = 'processed_data/'
# np.save fails if the target directory is missing — create it up front.
os.makedirs(output_path, exist_ok=True)

np.save('{}x.npy'.format(output_path), train_features)
np.save('{}y.npy'.format(output_path), train_labels)
np.save('{}test_x.npy'.format(output_path), test_features)
np.save('{}test_y.npy'.format(output_path), test_labels)
# Export the tokenizer's word -> index vocabulary for the Android client.
# open(..., 'w') fails if the target directory is missing — create it first.
os.makedirs('android', exist_ok=True)
with open('android/word_dict.json', 'w') as file:
    json.dump(tokenizer.word_index, file)

print('Data processed.')