# Select the dataset.
# Example arguments noted by the original author: 'imagenet', 'train', and
# the directory where the TFRecord files are stored.
# TFRecord files are expected to be named like 'voc_2012_%s_*.tfrecord',
# where %s is 'train' or 'test'.
dataset = dataset_factory.get_dataset(
    FLAGS.dataset_name,
    FLAGS.dataset_split_name,
    FLAGS.dataset_dir,
)
def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
    """Look up a dataset module by name and return the requested split.

    Args:
        name: Key into the module-level `datasets_map`.
        split_name: Name of the split to load; forwarded unchanged.
        dataset_dir: Directory holding the dataset files; forwarded unchanged.
        file_pattern: Optional file pattern; forwarded unchanged.
        reader: Optional reader; forwarded unchanged.

    Returns:
        Whatever the selected module's `get_split` returns (documented by
        the original author as a `Dataset`).

    Raises:
        ValueError: If `name` is not a key of `datasets_map`.
    """
    if name not in datasets_map:
        raise ValueError('Name of dataset unknown %s' % name)
    # For Pascal VOC 2012 this resolves to pascalvoc_2012.get_split
    # (per the original author's note).
    dataset_module = datasets_map[name]
    return dataset_module.get_split(split_name, dataset_dir,
                                    file_pattern, reader)


def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
    """Return the requested split using this module's dataset constants.

    Thin wrapper that fills in the module-level defaults and delegates to
    `pascalvoc_common.get_split`.

    Args:
        split_name: Name of the split to load.
        dataset_dir: Directory holding the dataset files.
        file_pattern: File pattern to use; any falsy value selects
            `FILE_PATTERN`.
        reader: Optional reader, forwarded unchanged.

    Returns:
        The value returned by `pascalvoc_common.get_split` (documented by
        the original author as a `Dataset` namedtuple).

    Raises:
        ValueError: Per the original docstring, if `split_name` is not a
            valid train/test split (raised by the delegate).
    """
    # File names must match 'voc_2012_%s_*.tfrecord' (original author's note).
    pattern = file_pattern or FILE_PATTERN
    return pascalvoc_common.get_split(
        split_name,
        dataset_dir,
        pattern,
        reader,
        SPLITS_TO_SIZES,        # e.g. {'train': 17125} per the original note
        ITEMS_TO_DESCRIPTIONS,
        NUM_CLASSES,            # 20 per the original note
    )


# For reference, the original author lists ITEMS_TO_DESCRIPTIONS as:
#
# ITEMS_TO_DESCRIPTIONS = {
#     'image': 'A color image of varying height and width.',
#     'shape': 'Shape of the image',
#     'object/bbox': 'A list of bounding boxes, one per each object.',
#     'object/label': 'A list of labels, one per each object.',
# }
def get_split(split_name, dataset_dir, file_pattern, reader,
              split_to_sizes, items_to_descriptions, num_classes):
    """Build the `slim.dataset.Dataset` describing a Pascal VOC split.

    Args:
        split_name: A train/test split name; must be a key of
            `split_to_sizes`.
        dataset_dir: The base directory of the dataset sources.
        file_pattern: The file pattern used to match the dataset sources.
            It is %-formatted with `split_name`, so it is expected to
            contain a '%s'.
        reader: The TensorFlow reader type; `None` selects
            `tf.TFRecordReader`.
        split_to_sizes: Mapping from split name to number of samples.
        items_to_descriptions: Mapping from decoder item name to a
            human-readable description; passed through to the Dataset.
        num_classes: Number of classes; passed through to the Dataset.

    Returns:
        A `slim.dataset.Dataset`.

    Raises:
        ValueError: If `split_name` is not a key of `split_to_sizes`.
    """
    if split_name not in split_to_sizes:
        raise ValueError('split name %s was not recognized.' % split_name)

    data_sources = os.path.join(dataset_dir, file_pattern % split_name)

    # None is allowed in the signature so the dataset factory can rely on
    # the default reader.
    if reader is None:
        reader = tf.TFRecordReader

    bbox_prefix = 'image/object/bbox/'

    # How each raw feature stored in the Pascal VOC TFRecords is parsed.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string,
                                           default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
    }
    # Per-object features are variable length (one entry per object).
    for coord in ('xmin', 'ymin', 'xmax', 'ymax'):
        keys_to_features[bbox_prefix + coord] = tf.VarLenFeature(
            dtype=tf.float32)
    for attribute in ('label', 'difficult', 'truncated'):
        keys_to_features[bbox_prefix + attribute] = tf.VarLenFeature(
            dtype=tf.int64)

    # How parsed features are assembled into the items a provider exposes.
    decoders = slim.tfexample_decoder
    items_to_handlers = {
        'image': decoders.Image('image/encoded', 'image/format'),
        'shape': decoders.Tensor('image/shape'),
        # The four coordinate features are combined into one item, in
        # [ymin, xmin, ymax, xmax] order.
        'object/bbox': decoders.BoundingBox(
            ['ymin', 'xmin', 'ymax', 'xmax'], bbox_prefix),
        'object/label': decoders.Tensor(bbox_prefix + 'label'),
        'object/difficult': decoders.Tensor(bbox_prefix + 'difficult'),
        'object/truncated': decoders.Tensor(bbox_prefix + 'truncated'),
    }
    decoder = decoders.TFExampleDecoder(keys_to_features, items_to_handlers)

    # Labels are only loaded when the dataset directory provides them
    # (the original note says has_labels checks for 'labels.txt').
    if dataset_utils.has_labels(dataset_dir):
        labels_to_names = dataset_utils.read_label_file(dataset_dir)
    else:
        labels_to_names = None

    return slim.dataset.Dataset(
        data_sources=data_sources,                    # TFRecord file pattern
        reader=reader,                                # reader type
        decoder=decoder,                              # example decoder
        num_samples=split_to_sizes[split_name],       # samples in the split
        items_to_descriptions=items_to_descriptions,  # item descriptions
        num_classes=num_classes,                      # number of classes
        labels_to_names=labels_to_names,              # label map, or None
    )


# For reference, the original author lists items_to_descriptions as:
#
# {'image': 'A color image of varying height and width.',
#  'shape': 'Shape of the image',
#  'object/bbox': 'A list of bounding boxes, one per each object.',
#  'object/label': 'A list of labels, one per each object.'}
这里额外说一句:存储数据中的 ymin、xmin、ymax、xmax 各自存储为 shape 为 (n,) 的张量(n 表示图像中对象的数目),但经过 items_to_handlers 解码之后,新的条目 object/bbox 的形状变为 (n, 4)。由于这关系到多目标检测后续的一系列处理,所以值得注意。
从TFR中获取 batch数据
# NOTE(review): block nesting below is reconstructed from a
# whitespace-collapsed paste; confirm against the training script.
with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
    # DatasetDataProvider takes a slim.dataset.Dataset as its first argument.
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=FLAGS.num_readers,
        common_queue_capacity=20 * FLAGS.batch_size,
        common_queue_min=10 * FLAGS.batch_size,
        shuffle=True,
    )

# Fetch what the SSD network needs (image, shape, labels, bboxes),
# addressed by decoder item name.
requested_items = ['image', 'shape', 'object/label', 'object/bbox']
image, shape, glabels, gbboxes = provider.get(requested_items)
# Resolve the training-mode preprocessing function for the chosen network.
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
    preprocessing_name,
    is_training=True,
)

# Pre-process image, labels and bboxes.
image, glabels, gbboxes = image_preprocessing_fn(
    image,
    glabels,
    gbboxes,
    out_shape=ssd_shape,      # (300, 300) per the original note
    data_format=DATA_FORMAT,  # 'NCHW' per the original note
)
def get_preprocessing(name, is_training=False):
    """Return the preprocessing function registered under `name`.

    Args:
        name: Preprocessing name; 'ssd_300_vgg' and 'ssd_512_vgg' are
            recognized and both map to `ssd_vgg_preprocessing`.
        is_training: Bound into the returned function and forwarded to
            `preprocess_image` on every call.

    Returns:
        A callable `preprocessing_fn(image, labels, bboxes, out_shape,
        data_format='NHWC', **kwargs)` that delegates to the selected
        module's `preprocess_image`.

    Raises:
        ValueError: If `name` is not a recognized preprocessing name.
    """
    registry = dict.fromkeys(('ssd_300_vgg', 'ssd_512_vgg'),
                             ssd_vgg_preprocessing)
    if name not in registry:
        raise ValueError('Preprocessing name [%s] was not recognized' % name)
    selected = registry[name]

    def preprocessing_fn(image, labels, bboxes,
                         out_shape, data_format='NHWC', **kwargs):
        return selected.preprocess_image(
            image, labels, bboxes, out_shape,
            data_format=data_format,
            is_training=is_training,
            **kwargs)

    return preprocessing_fn


def preprocess_image(image, labels, bboxes, out_shape, data_format,
                     is_training=False, **kwargs):
    """Dispatch to the training or evaluation preprocessing pipeline.

    Args:
        image: Input image, forwarded unchanged.
        labels: Object labels, forwarded unchanged.
        bboxes: Object bounding boxes, forwarded unchanged.
        out_shape: Target output shape, forwarded unchanged.
        data_format: Output data format, forwarded unchanged.
        is_training: Selects `preprocess_for_train` when truthy, otherwise
            `preprocess_for_eval`.
        **kwargs: Extra options; forwarded to `preprocess_for_eval` only.
            They are not passed on the training path.

    Returns:
        The value returned by the selected pipeline.
    """
    if not is_training:
        return preprocess_for_eval(image, labels, bboxes,
                                   out_shape=out_shape,
                                   data_format=data_format,
                                   **kwargs)
    return preprocess_for_train(image, labels, bboxes,
                                out_shape=out_shape,
                                data_format=data_format)
返回image, labels, bboxes
def preprocess_for_train(image, labels, bboxes, out_shape,
                         data_format='NHWC',
                         scope='ssd_preprocessing_train'):
    """Preprocess one image and its annotations for training.

    The pipeline is: convert to float32, random constrained crop, resize
    to `out_shape`, random horizontal flip, random color distortion,
    rescale by 255, whiten with the module's RGB means, and optionally
    transpose to channels-first.

    Args:
        image: Image tensor; its static rank must be 3.
        labels: Object labels; filtered together with `bboxes` by the crop.
        bboxes: Object bounding boxes; adjusted by the crop and the flip.
        out_shape: Size the cropped image is resized to.
        data_format: 'NCHW' transposes the image with perm (2, 0, 1);
            any other value leaves the layout unchanged.
        scope: Optional name scope.

    Returns:
        A tuple `(image, labels, bboxes)`. Per the original notes, labels
        has shape (n,) and bboxes has shape (n, 4).

    Raises:
        ValueError: If `image` does not have exactly 3 dimensions.
    """
    with tf.name_scope(scope, 'ssd_preprocessing_train',
                       [image, labels, bboxes]):
        if image.get_shape().ndims != 3:
            raise ValueError('Input must be of size [height, width, C>0]')

        # Convert to float scaled to [0, 1].
        if image.dtype != tf.float32:
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        tf_summary_image(image, bboxes, 'image_with_bboxes')

        # Constrained random crop. Labels and boxes are filtered and
        # re-expressed relative to the crop; the crop box itself is
        # returned as well but is not used here.
        distorted, labels, bboxes, _crop_box = distorted_bounding_box_crop(
            image, labels, bboxes,
            min_object_covered=MIN_OBJECT_COVERED,   # 0.25 per original note
            aspect_ratio_range=CROP_RATIO_RANGE)     # (0.6, 1.67) per note

        # Resize the crop to the network input size.
        distorted = tf_image.resize_image(
            distorted, out_shape,
            method=tf.image.ResizeMethod.BILINEAR,
            align_corners=False)
        tf_summary_image(distorted, bboxes, 'image_shape_distorted')

        # Randomly flip the image (and its boxes) horizontally.
        distorted, bboxes = tf_image.random_flip_left_right(distorted, bboxes)

        # Randomly distort the colors; one of 4 orderings is selected.
        # The third argument of distort_color is fast_mode, disabled here.
        distorted = apply_with_random_selector(
            distorted,
            lambda x, ordering: distort_color(x, ordering, False),
            num_cases=4)
        tf_summary_image(distorted, bboxes, 'image_color_distorted')

        # Rescale to the VGG input range, then whiten with the RGB means.
        image = distorted * 255.
        image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])

        # Move channels first when requested.
        if data_format == 'NCHW':
            image = tf.transpose(image, perm=(2, 0, 1))
        return image, labels, bboxes
def distorted_bounding_box_crop(image, labels, bboxes,
                                min_object_covered=0.3,
                                aspect_ratio_range=(0.9, 1.1),
                                area_range=(0.1, 1.0),
                                max_attempts=200,
                                clip_bboxes=True,
                                scope=None):
    """Randomly crop an image and update its labels and boxes to match.

    The crop window is sampled with
    `tf.image.sample_distorted_bounding_box`; see its documentation for
    the exact meaning of the sampling constraints.

    Args:
        image: 3-D image tensor.
        labels: Object labels, one per box.
        bboxes: Object boxes; expanded to [1, n, 4] before sampling.
            Coordinates are relative to the image and ordered
            [ymin, xmin, ymax, xmax].
        min_object_covered: Forwarded to the sampler. Defaults to 0.3.
        aspect_ratio_range: Forwarded to the sampler; allowed
            width / height range of the crop.
        area_range: Forwarded to the sampler; allowed fraction of the
            image area covered by the crop.
        max_attempts: Forwarded to the sampler; number of sampling
            attempts before it falls back to the whole image.
        clip_bboxes: Not used by this implementation.
        scope: Optional name scope.

    Returns:
        A tuple `(cropped_image, labels, bboxes, distort_bbox)`:
        the cropped image, the labels and boxes that survive the overlap
        filter (boxes expressed relative to the crop), and the crop box
        as a 4-element tensor.
    """
    with tf.name_scope(scope, 'distorted_bounding_box_crop',
                       [image, bboxes]):
        # The sampler returns three tensors:
        #   begin: 1-D, [offset_height, offset_width, 0], for tf.slice.
        #   size:  1-D, [target_height, target_width, -1], for tf.slice.
        #   box:   [1, 1, 4] float32, the sampled box in coordinates
        #          relative to the image width and height.
        crop_begin, crop_size, sampled_box = \
            tf.image.sample_distorted_bounding_box(
                tf.shape(image),
                bounding_boxes=tf.expand_dims(bboxes, 0),   # [1, n, 4]
                min_object_covered=min_object_covered,
                aspect_ratio_range=aspect_ratio_range,
                area_range=area_range,
                max_attempts=max_attempts,
                use_image_if_no_bounding_boxes=True)
        # Drop the two leading singleton dimensions -> shape [4].
        distort_bbox = sampled_box[0, 0]

        # Crop the image to the sampled window.
        cropped_image = tf.slice(image, crop_begin, crop_size)
        # The dynamic slice loses the static channel dimension; restore it.
        cropped_image.set_shape([None, None, 3])

        # Re-express the boxes relative to the crop, then drop the ones
        # that do not overlap it enough, together with their labels.
        bboxes = tfe.bboxes_resize(distort_bbox, bboxes)   # [4], [n, 4]
        labels, bboxes = tfe.bboxes_filter_overlap(
            labels, bboxes,
            threshold=BBOX_CROP_OVERLAP,   # 0.5 per the original note
            assign_negative=False)
        return cropped_image, labels, bboxes, distort_bbox
tf.image.sample_distorted_bounding_box:随机采样一个裁剪框(具体用法请查看官方文档),用于从原图中裁剪出一个子图;其返回值的最后一项是该裁剪框的坐标。
bboxes_resize:把框坐标的原点移到裁剪框的左上角,并以裁剪框的高、宽作为 y、x 方向的单位长度(即相对于裁剪框重新归一化)。
bboxes_filter_overlap:计算每个框与裁剪区域的重叠面积占原框面积的比例,舍弃该比例未超过阈值的 labels 和 bboxes。
def bboxes_resize(bbox_ref, bboxes, name=None):
    """Re-express boxes in the coordinate frame of a reference box.

    Each box is translated so the reference box's top-left corner becomes
    the origin, then divided by the reference box's height and width.

    Args:
        bbox_ref: Reference box, indexed as [ymin, xmin, ymax, xmax].
        bboxes: Boxes to transform, in the same coordinate order.
        name: Optional name scope.

    Returns:
        The transformed boxes.
    """
    with tf.name_scope(name, 'bboxes_resize'):
        # Translate: subtract the reference (ymin, xmin) from both corners.
        offset = tf.stack([bbox_ref[0], bbox_ref[1],
                           bbox_ref[0], bbox_ref[1]])
        bboxes = bboxes - offset
        # Scale: divide y by the reference height and x by its width.
        extent = tf.stack([bbox_ref[2] - bbox_ref[0],    # height
                           bbox_ref[3] - bbox_ref[1],    # width
                           bbox_ref[2] - bbox_ref[0],
                           bbox_ref[3] - bbox_ref[1]])
        bboxes = bboxes / extent
        return bboxes


def bboxes_filter_overlap(labels, bboxes, threshold=0.5,
                          assign_negative=False, scope=None):
    """Filter boxes by their relative overlap with the box [0, 0, 1, 1].

    Args:
        labels: Labels, one per box.
        bboxes: Boxes to test.
        threshold: A box is kept only if its score is strictly greater
            than this value.
        assign_negative: If true, keep every box and negate the labels
            of the rejected ones; otherwise remove rejected labels and
            boxes.
        scope: Optional name scope.

    Returns:
        A tuple `(labels, bboxes)` with the filtered or re-labelled
        elements.
    """
    with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
        # One score per box: intersection with [0, 0, 1, 1] divided by
        # the box's own area.
        unit_box = tf.constant([0, 0, 1, 1], bboxes.dtype)
        overlap = bboxes_intersection(unit_box, bboxes)
        keep = overlap > threshold
        if assign_negative:
            # Keep everything; mark insufficient overlap with a negated
            # label.
            labels = tf.where(keep, labels, -labels)
        else:
            # Drop labels and boxes whose overlap is insufficient.
            labels = tf.boolean_mask(labels, keep)
            bboxes = tf.boolean_mask(bboxes, keep)
        return labels, bboxes


# Used by bboxes_filter_overlap: ratio of the area shared with the
# reference box to each box's own area.
def bboxes_intersection(bbox_ref, bboxes, name=None):
    """Compute the relative intersection of boxes with a reference box.

    The score is the intersection area divided by the area of the box
    itself (not by the union).

    Args:
        bbox_ref: (N, 4) or (4,) tensor with the reference box(es).
        bboxes: (N, 4) tensor with the boxes to score.
        name: Optional name scope.

    Returns:
        (N,) tensor with the relative intersection of each box.
    """
    with tf.name_scope(name, 'bboxes_intersection'):
        # Transpose so each coordinate can be taken as one row.
        bboxes = tf.transpose(bboxes)
        bbox_ref = tf.transpose(bbox_ref)
        # Corners of the intersection rectangle.
        top = tf.maximum(bboxes[0], bbox_ref[0])
        left = tf.maximum(bboxes[1], bbox_ref[1])
        bottom = tf.minimum(bboxes[2], bbox_ref[2])
        right = tf.minimum(bboxes[3], bbox_ref[3])
        # Clamp at zero so disjoint boxes yield an empty intersection.
        h = tf.maximum(bottom - top, 0.)
        w = tf.maximum(right - left, 0.)
        # Areas.
        inter_vol = h * w
        bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])
        # NOTE(review): per the original author's note, safe_divide returns
        # the ratio where the box area is positive and 0 elsewhere;
        # confirm against tfe_math.
        scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
        return scores
# NOTE(review): block nesting below is reconstructed from a
# whitespace-collapsed paste; confirm against the training script.
with tf.device(deploy_config.inputs_device()):
    with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
        # DatasetDataProvider takes a slim.dataset.Dataset as its first
        # argument.
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size,
            shuffle=True,
        )

    # Fetch what the SSD network needs (image, shape, labels, bboxes),
    # addressed by decoder item name.
    requested_items = ['image', 'shape', 'object/label', 'object/bbox']
    image, shape, glabels, gbboxes = provider.get(requested_items)

    # Pre-process image, labels and bboxes. Per the original notes the
    # results are a channels-first image, labels (n,) and bboxes (n, 4).
    image, glabels, gbboxes = image_preprocessing_fn(
        image,
        glabels,
        gbboxes,
        out_shape=ssd_shape,      # (300, 300) per the original note
        data_format=DATA_FORMAT,  # 'NCHW' per the original note
    )