# [Completed][PyTorch] Add Anchor Boxes #1312

@@ -14,6 +14,17 @@ np.set_printoptions(2)
npx.set_np()
```

```{.python .input}
#@tab pytorch
%matplotlib inline
from d2l import torch as d2l
import torch
import itertools
from collections import namedtuple
import math
```

## Generating Multiple Anchor Boxes

Assume that the input image has a height of $h$ and a width of $w$. We generate anchor boxes with different shapes centered on each pixel of the image. Assume the size is $s\in (0, 1]$, the aspect ratio is $r > 0$, and the width and height of the anchor box are $ws\sqrt{r}$ and $hs/\sqrt{r}$, respectively. When the center position is given, an anchor box with known width and height is determined.

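As a quick check of these formulas (an illustrative aside, not part of the original diff), the cell below evaluates the fractional width and height of one anchor box:

```{.python .input}
#@tab pytorch
# For s=0.5 and r=2, the box spans s*sqrt(r) ~= 0.71 of the image width and
# s/sqrt(r) ~= 0.35 of its height
s, r = 0.5, 2
s * math.sqrt(r), s / math.sqrt(r)
```
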
@@ -36,9 +47,46 @@ Y = npx.multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
Y.shape
```

```{.python .input}
#@tab pytorch
#@save
def multibox_prior(feature_map_sizes, sizes, aspect_ratios):
    """Compute default box sizes with scale and aspect transform."""
    # The hardcoded 728 equals max(h, w) of the catdog.jpg image used below,
    # so it cancels against max(scale) and the sizes stay in (0, 1]
    sizes = [s * 728 for s in sizes]
    scale = feature_map_sizes
    steps_y = [1 / scale[0]]
    steps_x = [1 / scale[1]]
    sizes = [s / max(scale) for s in sizes]
    num_layers = 1
    boxes = []
    for i in range(num_layers):
        for h, w in itertools.product(range(feature_map_sizes[0]),
                                      range(feature_map_sizes[1])):
            cx = (w + 0.5) * steps_x[i]
            cy = (h + 0.5) * steps_y[i]

            # One square box per size, centered on this pixel
            for j in range(len(sizes)):
                s = sizes[j]
                boxes.append((cx, cy, s, s))
            # One extra box per aspect ratio, using the first size
            s = sizes[0]
            for ar in aspect_ratios:
                boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar)))
    return d2l.tensor(boxes)
```

```{.python .input}
#@tab pytorch
img = d2l.plt.imread('../img/catdog.jpg')
h, w = img.shape[0:2]

print(h, w)
X = torch.rand((1, 3, h, w))  # Construct input data
Y = multibox_prior((h, w), sizes=[0.75, 0.5, 0.25], aspect_ratios=(2, 0.5))
Y.shape
```

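Since `multibox_prior` emits one box per size plus one per aspect ratio at every pixel, the box count can be sanity-checked (an illustrative aside, not part of the original diff):

```{.python .input}
#@tab pytorch
# 3 sizes + 2 aspect ratios = 5 anchor boxes centered on each of the
# h * w pixels
Y.shape[0] == h * w * 5
```
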
We can see that the shape of the returned anchor box variable `Y` is (batch size, number of anchor boxes, 4). After changing the shape of the anchor box variable `Y` to (image height, image width, number of anchor boxes centered on the same pixel, 4), we can obtain all the anchor boxes centered on a specified pixel position. In the following example, we access the first anchor box centered on (250, 250). It has four elements: the $x, y$ axis coordinates in the upper-left corner and the $x, y$ axis coordinates in the lower-right corner of the anchor box. (In the PyTorch implementation above, boxes are instead returned in $(c_x, c_y, w, h)$ center format; they are converted to corner format by the `center_2_hw` function defined below.) The coordinate values of the $x$ and $y$ axes are divided by the width and height of the image, respectively, so the value range is between 0 and 1.

```{.python .input n=4}
#@tab all
boxes = Y.reshape(h, w, 5, 4)
boxes[250, 250, 0, :]
```

@@ -59,7 +107,35 @@ def show_bboxes(axes, bboxes, labels=None, colors=None):
     colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
     for i, bbox in enumerate(bboxes):
         color = colors[i % len(colors)]
-        rect = d2l.bbox_to_rect(bbox.asnumpy(), color)
+        rect = d2l.bbox_to_rect(d2l.numpy(bbox), color)
         axes.add_patch(rect)
         if labels and len(labels) > i:
             text_color = 'k' if color == 'w' else 'w'
             axes.text(rect.xy[0], rect.xy[1], labels[i],
                       va='center', ha='center', fontsize=9, color=text_color,
                       bbox=dict(facecolor=color, lw=0))
```

```{.python .input}
#@tab pytorch
#@save
def bbox_to_rect(bbox, color):
    """Convert bounding box to matplotlib format."""
    return d2l.plt.Rectangle(xy=(bbox[0], bbox[1]), width=bbox[2] - bbox[0],
                             height=bbox[3] - bbox[1], fill=False,
                             edgecolor=color, linewidth=2)

def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Show bounding boxes."""
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj
    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = bbox_to_rect(d2l.numpy(bbox), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                      va='center', ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))
```

**Review comment** (on lines +122 to +124): This method is defined in `bounding-box.md`; similarly for other methods.

**Reply:** I added it here.

@@ -79,6 +155,32 @@ show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
            's=0.75, r=0.5'])
```

```{.python .input}
#@tab pytorch
#@save
def center_2_hw(box: torch.Tensor) -> torch.Tensor:
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2) format."""
    return torch.cat(
        [box[:, 0, None] - box[:, 2, None] / 2,
         box[:, 1, None] - box[:, 3, None] / 2,
         box[:, 0, None] + box[:, 2, None] / 2,
         box[:, 1, None] + box[:, 3, None] / 2],
        dim=1)
```

```{.python .input}
#@tab pytorch
d2l.set_figsize()
bbox_scale = d2l.tensor((w, h, w, h))
fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, center_2_hw(boxes[250, 250, :, :]) * bbox_scale,
            ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2',
             's=0.75, r=0.5'])
```

## Intersection over Union

We just mentioned that the anchor box covers the dog in the image well. If the ground-truth bounding box of the target is known, how can "well" here be quantified? An intuitive method is to measure the similarity between anchor boxes and the ground-truth bounding box. We know that the Jaccard index can measure the similarity between two sets. Given sets $\mathcal{A}$ and $\mathcal{B}$, their Jaccard index is the size of their intersection divided by the size of their union:

$$J(\mathcal{A},\mathcal{B}) = \frac{\left|\mathcal{A} \cap \mathcal{B}\right|}{\left|\mathcal{A} \cup \mathcal{B}\right|}.$$

@@ -91,7 +193,48 @@ In fact, we can consider the pixel area of a bounding box as a collection of pix
![IoU is the ratio of the intersecting area to the union area of two bounding boxes.](../img/iou.svg)
:label:`fig_iou`

```{.python .input}
#@tab pytorch
#@save
def intersect(box_a: torch.Tensor, box_b: torch.Tensor) -> torch.Tensor:
    # Convert (cx, cy, w, h) to (x1, y1, x2, y2), since it is easier to
    # extract min/max coordinates in corner format
    temp_box_a, temp_box_b = center_2_hw(box_a), center_2_hw(box_b)

    max_xy = torch.min(temp_box_a[:, None, 2:], temp_box_b[None, :, 2:])
    min_xy = torch.max(temp_box_a[:, None, :2], temp_box_b[None, :, :2])

    inter = torch.clamp((max_xy - min_xy), min=0)
    return inter[:, :, 0] * inter[:, :, 1]

def box_area(box: torch.Tensor) -> torch.Tensor:
    return box[:, 2] * box[:, 3]

def jaccard(box_a: torch.Tensor, box_b: torch.Tensor) -> torch.Tensor:
    intersection = intersect(box_a, box_b)
    union = (box_area(box_a).unsqueeze(1) + box_area(box_b).unsqueeze(0)
             - intersection)
    return intersection / union

def find_overlap(bb_true_i, anchors, jaccard_overlap):
    # For each ground-truth box, collect the best-matching anchor plus all
    # anchors whose IoU exceeds the threshold
    jaccard_tensor = jaccard(anchors, bb_true_i)
    _, max_overlap = torch.max(jaccard_tensor, dim=0)

    overlap_list = []
    for i in range(len(bb_true_i)):
        threshold_overlap = (jaccard_tensor[:, i] > jaccard_overlap).nonzero()

        if len(threshold_overlap) > 0:
            threshold_overlap = threshold_overlap[:, 0]
            overlap = torch.cat([max_overlap[i].view(1), threshold_overlap])
            overlap = torch.unique(overlap)
        else:
            overlap = max_overlap[i].view(1)
        overlap_list.append(overlap)
    return overlap_list
```

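As a quick sanity check of `jaccard` (an illustrative aside, not part of the original diff), two unit squares in $(c_x, c_y, w, h)$ format whose centers are half a width apart intersect in area 0.5 and cover a union of 1.5, so their IoU is $1/3$:

```{.python .input}
#@tab pytorch
# Two 1x1 boxes offset horizontally by 0.5: intersection 0.5, union 1.5
box_a = d2l.tensor([[0.5, 0.5, 1.0, 1.0]])
box_b = d2l.tensor([[1.0, 0.5, 1.0, 1.0]])
jaccard(box_a, box_b)  # tensor([[0.3333]])
```
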
For the remainder of this section, we will use IoU to measure the similarity between anchor boxes and ground-truth bounding boxes, and between different anchor boxes.

@@ -140,7 +283,26 @@ anchors = np.array([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],

 fig = d2l.plt.imshow(img)
 show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
-show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4']);
+show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
```

```{.python .input}
#@tab pytorch
ground_truth_class = d2l.tensor([[0], [1]])
ground_truth_bbox = d2l.tensor([[0.1, 0.08, 0.52, 0.92],
                                [0.55, 0.2, 0.9, 0.88]])
anchors = d2l.tensor([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],
                      [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],
                      [0.57, 0.3, 0.92, 0.9]])

fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, ground_truth_bbox * bbox_scale, ['dog', 'cat'], 'k')
show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
```

We can label categories and offsets for anchor boxes by using the `multibox_target` function. This function sets the background category to 0 and increments the integer indices of the target categories by 1 (1 for dog and 2 for cat). We add example dimensions to the anchor boxes and ground-truth bounding boxes and construct random predicted results with a shape of (batch size, number of categories including background, number of anchor boxes) by using the `expand_dims` function.

@@ -151,9 +313,36 @@ labels = npx.multibox_target(np.expand_dims(anchors, axis=0),
                             np.zeros((1, 3, 5)))
```

```{.python .input}
#@tab pytorch
#@save
def multibox_target(class_true, bb_true, anchors):
    # Shift class indices by 1 so that 0 is reserved for the background
    # category
    class_true += 1
    class_target = torch.zeros(anchors.shape[0]).long()
    overlap_list = find_overlap(bb_true, anchors, 0.5)
    overlap_coordinates = torch.zeros_like(anchors)
    for j in range(len(overlap_list)):
        overlap = overlap_list[j]
        class_target[overlap] = class_true[j, 0].long()
        overlap_coordinates[overlap] = 1.

    # Flatten the anchors and zero out those not assigned to any object
    new_anchors = torch.cat([*anchors])
    overlap_coordinates = torch.cat([*overlap_coordinates])
    new_anchors = new_anchors * overlap_coordinates

    return (new_anchors.unsqueeze(0), overlap_coordinates.unsqueeze(0),
            class_target.unsqueeze(0))
```

```{.python .input}
#@tab pytorch
labels = multibox_target(ground_truth_class.clone(), ground_truth_bbox,
                         anchors)
```

||
There are three items in the returned result, all of which are in the tensor format. The third item is represented by the category labeled for the anchor box. | ||
|
||
```{.python .input n=8}
#@tab all
labels[2]
```

@@ -164,12 +353,14 @@ The second item of the return value is a mask variable, with the shape of (batch
Because we do not care about background detection, offsets of the negative class should not affect the target function. By multiplying element-wise, the zeros in the mask variable can filter out negative class offsets before the target function is calculated; a short demonstration follows the `labels[0]` cell below.

```{.python .input n=9}
#@tab all
labels[1]
```

The first item returned is the four offset values labeled for each anchor box, with the offsets of negative class anchor boxes labeled as 0.

```{.python .input n=10}
#@tab all
labels[0]
```

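To make the masking concrete (an illustrative aside, not part of the original diff; in this PyTorch implementation the first item already has the mask applied, so the product is unchanged):

```{.python .input}
#@tab pytorch
# Element-wise multiplication by the mask zeroes out the offsets of
# negative class (background) anchor boxes
labels[0] * labels[1]
```
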
@@ -191,6 +382,16 @@ cls_probs = np.array([[0] * 4,  # Predicted probability for background
                      [0.1, 0.2, 0.3, 0.9]])  # Predicted probability for cat
```

```{.python .input}
#@tab pytorch
anchors = d2l.tensor([[0.1, 0.08, 0.52, 0.92], [0.08, 0.2, 0.56, 0.95],
                      [0.15, 0.3, 0.62, 0.91], [0.55, 0.2, 0.9, 0.88]])
offset_preds = d2l.tensor([0] * anchors.size(0))
cls_probs = d2l.tensor([[0] * 4,  # Predicted probability for background
                        [0.9, 0.8, 0.7, 0.1],  # Predicted probability for dog
                        [0.1, 0.2, 0.3, 0.9]])  # Predicted probability for cat
```

Print prediction bounding boxes and their confidence levels on the image.

```{.python .input n=12}
@@ -199,6 +400,13 @@ show_bboxes(fig.axes, anchors * bbox_scale,
            ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])
```

```{.python .input}
#@tab pytorch
fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, anchors * bbox_scale,
            ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])
```

We use the `multibox_detection` function to perform NMS and set the threshold to 0.5. This adds an example dimension to the tensor input. We can see that the shape of the returned result is (batch size, number of anchor boxes, 6). The 6 elements of each row represent the output information for the same prediction bounding box. The first element is the predicted category index, which starts from 0 (0 is dog, 1 is cat). The value -1 indicates background or removal in NMS. The second element is the confidence level of the prediction bounding box. The remaining four elements are the $x, y$ axis coordinates of the upper-left corner and the $x, y$ axis coordinates of the lower-right corner of the prediction bounding box (the value range is between 0 and 1).

```{.python .input n=13}
@@ -209,6 +417,69 @@ output = npx.multibox_detection(
    nms_threshold=0.5)
output
```
```{.python .input}
#@tab pytorch
#@save
PredBoundingBox = namedtuple("PredBoundingBox",
                             ["probability", "class_id", "classname",
                              "bounding_box"])

def non_max_suppression(bounding_boxes: list,
                        iou_threshold: float = 0.1) -> list:
    # Greedy NMS: repeatedly keep the most confident remaining box and drop
    # every other box that overlaps it by more than iou_threshold
    bounding_boxes = sorted(bounding_boxes, key=lambda bb: bb.probability,
                            reverse=True)
    filtered_bb = []

    while len(bounding_boxes) != 0:
        best_bb = bounding_boxes.pop(0)
        filtered_bb.append(best_bb)

        remove_items = []
        for bb in bounding_boxes:
            iou = jaccard(torch.tensor(best_bb.bounding_box).unsqueeze(0),
                          torch.tensor(bb.bounding_box).unsqueeze(0))
            if iou > iou_threshold:
                remove_items.append(bb)
        bounding_boxes = [bb for bb in bounding_boxes
                          if bb not in remove_items]
    return filtered_bb

def multibox_detection(id_cat, cls_probs, anchors, nms_threshold):
    # Shift category ids by 1 so that id 0 denotes background
    id_new = dict()
    id_new[0] = 'background'
    for i in id_cat.keys():
        id_new[i + 1] = id_cat[i]

    cls_probs = cls_probs.transpose(0, 1)

    # For each anchor box, keep the most probable category
    prob, class_id = torch.max(cls_probs, 1)
    prob = prob.detach().cpu().numpy()
    class_id = class_id.detach().cpu().numpy()

    output_bb = [PredBoundingBox(probability=prob[i],
                                 class_id=class_id[i],
                                 classname=id_new[class_id[i]],
                                 bounding_box=[anchors[i, 0], anchors[i, 1],
                                               anchors[i, 2], anchors[i, 3]])
                 for i in range(0, len(prob))]

    filtered_bb = non_max_suppression(output_bb, nms_threshold)

    out = []
    for bb in filtered_bb:
        out.append([bb.class_id - 1, bb.probability, *bb.bounding_box])
    return torch.Tensor(out)
```

**Review comment** (on `non_max_suppression`): nms not possible through …

```{.python .input}
#@tab pytorch
id_category = dict()
id_category[0] = 'dog'
id_category[1] = 'cat'
output = multibox_detection(id_category, cls_probs, anchors,
                            nms_threshold=0.5)
output
```

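Each row of `output` can be unpacked into its fields; the sketch below (an illustration only, not part of the original diff) decodes the first retained box:

```{.python .input}
#@tab pytorch
# Rows are [class index, confidence, x1, y1, x2, y2]
class_idx, confidence, *coords = output[0].tolist()
('dog', 'cat')[int(class_idx)], confidence, coords
```
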
We remove the prediction bounding boxes of category -1 and visualize the results retained by NMS.

@@ -221,6 +492,15 @@ for i in output[0].asnumpy():
    show_bboxes(fig.axes, [np.array(i[2:]) * bbox_scale], label)
```

```{.python .input}
#@tab pytorch
# The prediction bounding boxes with category -1 are already removed by
# multibox_detection
fig = d2l.plt.imshow(img)
for i in output.numpy():
    label = ('dog=', 'cat=')[int(i[0])] + str(i[1])
    show_bboxes(fig.axes, [d2l.tensor(i[2:]) * bbox_scale], label)
```

In practice, we can remove prediction bounding boxes with lower confidence levels before performing NMS, thereby reducing the amount of computation for NMS. We can also filter the output of NMS, for example, by only retaining results with higher confidence levels as the final output, as sketched below.

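A minimal sketch of such output filtering (an assumption for illustration, not part of the original diff):

```{.python .input}
#@tab pytorch
# Keep only NMS outputs whose confidence exceeds a chosen threshold
conf_threshold = 0.5  # hypothetical cutoff; tune per application
output[output[:, 1] >= conf_threshold]
```
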
**Review comment:** Here it is right that you define this function, because torch/torchvision don't offer a built-in method for `multibox_prior` like `mxnet`'s multibox prior.