I am building a classification model to predict images over 3 classes. The data is balanced, with 10.5k training images (3.5k per class) and 3k validation images (1k per class).
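For context, the datasets are built roughly like this (a minimal sketch; the directory names and the BATCH_SIZE value are assumptions, not my exact pipeline):

import tensorflow as tf

IMAGE_SIZE = 80
BATCH_SIZE = 32  # assumed value

# Hypothetical directory layout: one sub-folder per class.
train_ds = tf.keras.utils.image_dataset_from_directory(
    "data/train",
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    label_mode="int",   # integer labels for sparse_categorical_crossentropy
    shuffle=True,
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    "data/val",
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    label_mode="int",
    shuffle=False,
)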
I got the validation accuracy up to around 70%-75%, but I can't push it further. I used keras_tuner.Hyperband to search for optimal hyperparameters, and the best model it found is this:
def build_model():
    """Builds a convolutional model."""
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(80, 80, 3)),
        tf.keras.layers.Rescaling(1./255)
    ])
    # 1st block - 48 filters
    model.add(tf.keras.layers.Conv2D(
        filters=48,
        kernel_size=4,
        padding="same",
    ))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.MaxPooling2D())
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.ReLU())
    model.add(tf.keras.layers.Dropout(rate=0.1))
    # 2nd block - 192 filters, kernel 4, max pooling, no dropout
    model.add(tf.keras.layers.Conv2D(
        filters=192,
        kernel_size=4,
        padding="same",
    ))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.MaxPooling2D())
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.ReLU())
    # 3rd block - 256 filters, kernel 5, max pooling, dropout 0.1
    model.add(tf.keras.layers.Conv2D(
        filters=256,
        kernel_size=5,
        padding="same",
    ))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.MaxPooling2D())
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.ReLU())
    model.add(tf.keras.layers.Dropout(rate=0.1))
    # global average pooling
    model.add(tf.keras.layers.GlobalAveragePooling2D())
    # flatten - no hyperparameter
    model.add(tf.keras.layers.Flatten())
    # dense layers: 2, no regularizers
    # 1st dense: 288 units
    model.add(tf.keras.layers.Dense(units=288))
    model.add(tf.keras.layers.Activation('relu'))
    # 2nd dense: 192 units, 0.3 dropout, no regularizers
    model.add(tf.keras.layers.Dense(units=192))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dropout(rate=0.3))
    # last dense layer for num_classes
    model.add(tf.keras.layers.Dense(3))
    model.add(tf.keras.layers.Activation('softmax'))
    lr = 0.0010297
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(
        optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model
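For reference, the Hyperband search itself was set up roughly like this (a minimal sketch; the reduced search space, max_epochs and directory names shown here are illustrative assumptions, not my exact tuner configuration):

import keras_tuner

def build_tunable_model(hp):
    # Illustrative search space; the real search also covered the kernel sizes,
    # dropout rates and dense units that ended up in build_model() above.
    filters_1 = hp.Int("filters_1", min_value=32, max_value=256, step=32)
    lr = hp.Float("learning_rate", min_value=1e-4, max_value=1e-2, sampling="log")
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(80, 80, 3)),
        tf.keras.layers.Rescaling(1./255),
        tf.keras.layers.Conv2D(filters_1, 4, padding="same", activation="relu"),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(3, activation="softmax"),
    ])
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

tuner = keras_tuner.Hyperband(
    build_tunable_model,
    objective="val_accuracy",
    max_epochs=40,            # assumption
    directory="tuning",       # assumption
    project_name="cnn_3_classes",
)
tuner.search(train_ds, validation_data=val_ds)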
That model resulted in around 0.73 val_accuracy, with no data augmentation at all. The thing is that after getting to 0.7, both training and validation accuracy were stuck there, pretty stable, neither improving nor decreasing.
So I tried modifying the learning rate to use cosine decay, switching to SGD as the optimizer, trying some data augmentation, and also making the model more complex so that the training accuracy would grow further.
I came up with two candidates that I think are closest to a better model, even though the val_accuracy didn't increase.
For both of them, this was the cosine decay learning rate scheduler and optimizer:
import math
from keras import ops
from keras.optimizers import schedules

def lr_warmup_cosine_decay(
    global_step,
    warmup_steps,
    hold=0,
    total_steps=0,
    start_lr=0.0,
    target_lr=1e-2,
):
    # Cosine decay
    learning_rate = (
        0.5
        * target_lr
        * (
            1
            + ops.cos(
                math.pi
                * ops.convert_to_tensor(
                    global_step - warmup_steps - hold, dtype="float32"
                )
                / ops.convert_to_tensor(
                    total_steps - warmup_steps - hold, dtype="float32"
                )
            )
        )
    )
    warmup_lr = target_lr * (global_step / warmup_steps)
    if hold > 0:
        learning_rate = ops.where(
            global_step > warmup_steps + hold, learning_rate, target_lr
        )
    learning_rate = ops.where(global_step < warmup_steps, warmup_lr, learning_rate)
    return learning_rate
class WarmUpCosineDecay(schedules.LearningRateSchedule):
    def __init__(self, warmup_steps, total_steps, hold, start_lr=0.0, target_lr=1e-2):
        super().__init__()
        self.start_lr = start_lr
        self.target_lr = target_lr
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.hold = hold

    def __call__(self, step):
        lr = lr_warmup_cosine_decay(
            global_step=step,
            total_steps=self.total_steps,
            warmup_steps=self.warmup_steps,
            start_lr=self.start_lr,
            target_lr=self.target_lr,
            hold=self.hold,
        )
        return ops.where(step > self.total_steps, 0.0, lr)
The optimizer:

total_images = 10500
total_steps = (total_images // BATCH_SIZE) * EPOCHS
warmup_steps = int(0.1 * total_steps)
hold_steps = int(0.45 * total_steps)

schedule = WarmUpCosineDecay(
    start_lr=0.05,
    target_lr=1e-2,
    warmup_steps=warmup_steps,
    total_steps=total_steps,
    hold=hold_steps,
)

optimizer = tf.keras.optimizers.SGD(
    weight_decay=5e-4,
    learning_rate=schedule,
    momentum=0.9,
)
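Both variants were then compiled and trained with this schedule/optimizer pair, roughly like so (a sketch; build_model_variant is a hypothetical stand-in for the two models described next, and EPOCHS is the same constant used in total_steps above, 40 in the runs below):

model = build_model_variant()   # hypothetical builder for variant 1 or 2 below
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
)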
1. No data augmentation, one more dense layer, and no kernel regularization: the same model, just with the 1st dense layer repeated, with no dropout.
# 0.73 val_accuracy, while training_accuracy went up to 0.9 over 40 epochs
2. No data augmentation, one more dense layer, and kernel_regularizer=L2(0.01): the same as the first, just with kernel_regularizer=L2(0.01) added to the dense layers (see the sketch below).
# around 0.75 val_accuracy, while training_accuracy went up to 0.98 over 40 epochs
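Concretely, the dense head of variant 2 looks roughly like this (a sketch of my reading of the change; the width of the repeated layer and whether the output layer is also regularized are assumptions):

from tensorflow.keras import regularizers

# Dense head for variant 2: the 288-unit layer repeated, L2(0.01) on the dense layers.
model.add(tf.keras.layers.Dense(units=288, kernel_regularizer=regularizers.L2(0.01)))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(units=288, kernel_regularizer=regularizers.L2(0.01)))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(units=192, kernel_regularizer=regularizers.L2(0.01)))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dropout(rate=0.3))
model.add(tf.keras.layers.Dense(3, kernel_regularizer=regularizers.L2(0.01)))
model.add(tf.keras.layers.Activation('softmax'))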
So I saw this as overfitting and tried to tweak the training data via some data augmentation. At first I overshot it, with too many augmentations, like this:
import keras_cv

# simple random flip
random_flip = keras_cv.layers.RandomFlip()
augmenters = [random_flip]

# crop + resize
crop_and_resize = keras_cv.layers.RandomCropAndResize(
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    crop_area_factor=(0.8, 1.0),
    aspect_ratio_factor=(0.9, 1.1),
)
augmenters += [crop_and_resize]

# random augmentations
rand_augment = keras_cv.layers.RandAugment(
    augmentations_per_image=3,
    value_range=(0, 255),
    magnitude=0.3,
    magnitude_stddev=0.2,
    rate=0.7,
)
augmenters += [rand_augment]

# random choice between CutMix and MixUp
cut_mix = keras_cv.layers.CutMix()
mix_up = keras_cv.layers.MixUp()
cut_mix_or_mix_up = keras_cv.layers.RandomChoice([cut_mix, mix_up], batchwise=True)
augmenters += [cut_mix_or_mix_up]
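The augmenters are chained onto the batched training dataset roughly like this (a sketch; to_dict, apply_augmenters and to_tuple are helper names I'm using for illustration, and the one-hot label handling that CutMix/MixUp require, plus the matching switch to a dense categorical loss, is an assumption about the pipeline):

NUM_CLASSES = 3

def to_dict(image, label):
    # CutMix/MixUp expect dense (one-hot) labels.
    return {
        "images": tf.cast(image, tf.float32),
        "labels": tf.one_hot(label, NUM_CLASSES),
    }

def apply_augmenters(inputs):
    for augmenter in augmenters:
        inputs = augmenter(inputs)
    return inputs

def to_tuple(inputs):
    return inputs["images"], inputs["labels"]

augmented_train_ds = (
    train_ds
    .map(to_dict, num_parallel_calls=tf.data.AUTOTUNE)
    .map(apply_augmenters, num_parallel_calls=tf.data.AUTOTUNE)
    .map(to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
)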
But this resulted in low training_accuracy and a jumping val_accuracy. Then I tried just two simple augmentations, random_flip and random_zoom, but this again resulted in a very jumpy val_accuracy and low training_accuracy. The val_accuracy would go from 0.4 to 0.5, then back to 0.45, then 0.6: totally unstable.
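The lighter pipeline was essentially just this (a sketch; I'm assuming the stock Keras preprocessing layers and an illustrative zoom factor):

# Minimal augmentation pipeline: random flip + random zoom only.
light_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomZoom(0.2),   # illustrative zoom range
])

augmented_train_ds = train_ds.map(
    lambda x, y: (light_augmentation(x, training=True), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)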
What should I do further? It seems like I'm somehow stuck between a stable model with a maximum capacity of 0.7 and two overfitting models with a 0.7 validation accuracy at best. The images are 80x80 color. I did plot the confusion matrix for the validation data: for class 0 the accuracy is about 85-90%, class 1 is around 70%, and class 2, which is the worst, gets roughly 60% accuracy.
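For reference, the per-class numbers come from something like this (a sketch assuming scikit-learn's confusion_matrix and the unshuffled val_ds from above):

import numpy as np
from sklearn.metrics import confusion_matrix

# Collect true labels and predictions over the (unshuffled) validation set.
y_true = np.concatenate([y.numpy() for _, y in val_ds])
y_pred = np.argmax(model.predict(val_ds), axis=1)

cm = confusion_matrix(y_true, y_pred)
per_class_acc = cm.diagonal() / cm.sum(axis=1)   # recall per class
print(cm)
print(per_class_acc)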