opencv · obtx · Jul 19, 2022 · Jul 29, 2022 · Jul 29, 2022 · Aug 6, 2022
diff --git a/models/multitask_centernet/LICENSE b/models/multitask_centernet/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Sida Yi <[email protected]>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/models/multitask_centernet/README.md b/models/multitask_centernet/README.md
@@ -0,0 +1,30 @@
+# MCN
+
+Multitask-Centernet (MCN) is a multi-task network (MTN). Studies have shown that training with multiple tasks linked to each other can sometimes even improve the quality of training and prediction compared to single-task learning (STL). When the network receives the same type of input, it is likely to extract similar features. In this case, a shared backbone can take advantage of the similar semantics of these input features.
+
+Notes:
+- Model source: [here](https://drive.google.com/file/d/1HmYZ_HccS41kolqW9KHfcKEQKjXSBZnY/view?usp=sharing).
+- For details on training this model, please visit my home page
+
+## Demo
+
+Run the following command to try the demo:
+```shell
+# detect on an image; the annotated result is saved as result.png
+python demo.py --imgpath /path/to/image --modelpath /path/to/model.onnx
+```
+
+### Example outputs
+
+![detection and pose estimation demo](./examples/ori_vis_0.png)
+
+![semantic segmentation demo](./examples/ori_vis_masks_0.png)
+
+## License
+
+All files in this directory are licensed under [MIT License](./LICENSE).
+
+## Reference
+
+- https://arxiv.org/abs/2108.05060v2
+
diff --git a/models/multitask_centernet/class.names b/models/multitask_centernet/class.names
@@ -0,0 +1,18 @@
+person
+nose
+left_eye
+right_eye
+left_ear
+right_ear
+left_shoulder
+right_shoulder
+left_elbow
+right_elbow
+left_wrist
+right_wrist
+left_hip
+right_hip
+left_knee
+right_knee
+left_ankle
+right_ankle
diff --git a/models/multitask_centernet/demo.py b/models/multitask_centernet/demo.py
@@ -0,0 +1,15 @@
+import cv2
+import argparse
+import numpy as np
+from multitask_centernet import MCN
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--imgpath', type=str, default='images/d2645891.jpg', help="image path")
+    parser.add_argument('--modelpath', type=str, default='MCN.onnx')
+    args = parser.parse_args()
+
+    mcn = MCN(args.modelpath)
+    srcimg = cv2.imread(args.imgpath)
+    srcimg = mcn.detect(srcimg)
+    cv2.imwrite('result.png', srcimg)
diff --git a/models/multitask_centernet/examples/ori_vis_0.png b/models/multitask_centernet/examples/ori_vis_0.png
diff --git a/models/multitask_centernet/examples/ori_vis_5.png b/models/multitask_centernet/examples/ori_vis_5.png
diff --git a/models/multitask_centernet/examples/ori_vis_masks_0.png b/models/multitask_centernet/examples/ori_vis_masks_0.png
diff --git a/models/multitask_centernet/examples/ori_vis_masks_5.png b/models/multitask_centernet/examples/ori_vis_masks_5.png
diff --git a/models/multitask_centernet/multitask_centernet.py b/models/multitask_centernet/multitask_centernet.py
@@ -0,0 +1,210 @@
+import cv2
+import argparse
+import numpy as np
+
+config = {'person_conf_thres': 0.7, 'person_iou_thres': 0.45, 'kp_conf_thres': 0.5,
+          'kp_iou_thres': 0.45, 'conf_thres_kp_person': 0.2, 'overwrite_tol': 25,
+          'kp_face': [0, 1, 2, 3, 4], 'use_kp_dets': True,
+          'segments': {1: [5, 6], 2: [5, 11], 3: [11, 12], 4: [12, 6], 5: [5, 7], 6: [7, 9], 7: [6, 8], 8: [8, 10],
+                       9: [11, 13], 10: [13, 15], 11: [12, 14], 12: [14, 16]},
+          'crowd_segments':{1: [0, 13], 2: [1, 13], 3: [0, 2], 4: [2, 4], 5: [1, 3], 6: [3, 5], 7: [0, 6], 8: [6, 7], 9: [7, 1], 10: [6, 8], 11: [8, 10], 12: [7, 9], 13: [9, 11], 14: [12, 13]},
+          'crowd_kp_face':[]}
+
+class MCN():
+    def __init__(self, modelpath):
+        with open('class.names', 'rt') as f:
+            self.classes = f.read().rstrip('\n').split('\n')
+            self.lines = config['segments']
+            self.kp_face = config['kp_face']
+
+        self.num_classes = len(self.classes)
+        self.inpHeight, self.inpWidth = 1280, 1280
+        anchors = [[19, 27, 44, 40, 38, 94], [96, 68, 86, 152, 180, 137], [140, 301, 303, 264, 238, 542],
+                   [436, 615, 739, 380, 925, 792]]
+        self.stride = np.array([8., 16., 32., 64.])
+        self.nl = len(anchors)
+        self.na = len(anchors[0]) // 2
+        self.grid = [np.zeros(1)] * self.nl
+        self.anchor_grid = np.asarray(anchors, dtype=np.float32).reshape(self.nl, -1, 2)
+        self.net = cv2.dnn.readNet(modelpath)
+        self._inputNames = ''
+        self.last_ind = 5 + self.num_classes
+
+    def resize_image(self, srcimg, keep_ratio=True, dynamic=False):
+        top, left, newh, neww = 0, 0, self.inpWidth, self.inpHeight
+        if keep_ratio and srcimg.shape[0] != srcimg.shape[1]:
+            hw_scale = srcimg.shape[0] / srcimg.shape[1]
+            if hw_scale > 1:
+                newh, neww = self.inpHeight, int(self.inpWidth / hw_scale)
+                img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
+                if not dynamic:
+                    left = int((self.inpWidth - neww) * 0.5)
+                    img = cv2.copyMakeBorder(img, 0, 0, left, self.inpWidth - neww - left, cv2.BORDER_CONSTANT,
+                                             value=(114, 114, 114))  # add border
+            else:
+                newh, neww = int(self.inpHeight * hw_scale), self.inpWidth
+                img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
+                if not dynamic:
+                    top = int((self.inpHeight - newh) * 0.5)
+                    img = cv2.copyMakeBorder(img, top, self.inpHeight - newh - top, 0, 0, cv2.BORDER_CONSTANT,
+                                             value=(114, 114, 114))
+        else:
+            img = cv2.resize(srcimg, (self.inpWidth, self.inpHeight), interpolation=cv2.INTER_AREA)
+        return img, newh, neww, top, left
+
+    def _make_grid(self, nx=20, ny=20):
+        xv, yv = np.meshgrid(np.arange(ny), np.arange(nx))
+        return np.stack((xv, yv), 2).reshape((-1, 2)).astype(np.float32)
+
+    def preprocess(self, img):
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = img.astype(np.float32) / 255.0
+        return img
+
+    def postprocess(self, frame, outs, padsize=None):
+        frameHeight = frame.shape[0]
+        frameWidth = frame.shape[1]
+        newh, neww, padh, padw = padsize
+        ratioh, ratiow = frameHeight / newh, frameWidth / neww
+        # Scan through all the bounding boxes output from the network and keep only the
+        # ones with high confidence scores. Assign the box's class label as the class with the highest score.
+
+        person_confidences, kp_confidences = [], []
+        person_boxes, kp_boxes = [], []
+        person_classIds, kp_classIds = [], []
+        person_rowinds = []
+        for i in range(outs.shape[0]):
+            detection = outs[i, :]
+            scores = detection[5:self.last_ind]
+            classId = np.argmax(scores)
+            confidence = scores[classId] * detection[4]
+            if classId == 0:
+                if detection[4] > config['person_conf_thres'] and confidence > config['person_conf_thres']:
+                    center_x = int((detection[0] - padw) * ratiow)
+                    center_y = int((detection[1] - padh) * ratioh)
+                    width = int(detection[2] * ratiow)
+                    height = int(detection[3] * ratioh)
+                    left = int(center_x - width * 0.5)
+                    top = int(center_y - height * 0.5)
+
+                    person_confidences.append(float(confidence))
+                    person_boxes.append([left, top, width, height])
+                    person_classIds.append(classId)
+                    person_rowinds.append(i)
+            else:
+                if detection[4] > config['kp_conf_thres'] and confidence > config['kp_conf_thres']:
+                    center_x = int((detection[0] - padw) * ratiow)
+                    center_y = int((detection[1] - padh) * ratioh)
+                    width = int(detection[2] * ratiow)
+                    height = int(detection[3] * ratioh)
+                    left = int(center_x - width * 0.5)
+                    top = int(center_y - height * 0.5)
+
+                    kp_confidences.append(float(confidence))
+                    kp_boxes.append([left, top, width, height])
+                    kp_classIds.append(classId)
+
+        # Perform non maximum suppression to eliminate redundant overlapping boxes with
+        # lower confidences.
+        # print(person_boxes)
+        if len(person_boxes) == 0:
+            return frame
+        person_indices = cv2.dnn.NMSBoxes(person_boxes, person_confidences, config['person_conf_thres'],
+                                          config['person_iou_thres']).flatten()
+        kp_indices = cv2.dnn.NMSBoxes(kp_boxes, kp_confidences, config['kp_conf_thres'],
+                                      config['kp_iou_thres']).flatten()
+
+        poses = []
+        for i in person_indices:
+            if person_confidences[i] > config['conf_thres_kp_person']:
+                pose = outs[person_rowinds[i], self.last_ind:].reshape((-1, 2))
+                pose[:, 0] = (pose[:, 0] - padw) * ratiow
+                pose[:, 1] = (pose[:, 1] - padh) * ratioh
+                poses.append(pose)
+        nd = len(poses)
+        poses = np.array(poses)
+        poses = np.concatenate((poses, np.zeros((nd, poses.shape[1], 1))), axis=-1)
+        for j in kp_indices:
+            box = kp_boxes[j]
+            x = box[0] + 0.5 * box[2]
+            y = box[1] + 0.5 * box[3]
+            pt_id = kp_classIds[j] - 1
+            pose_kps = poses[:, pt_id, :]
+            dist = np.linalg.norm(pose_kps[:, :2] - np.array([[x, y]]), axis=-1)
+            kp_match = np.argmin(dist)
+            if kp_confidences[j] > pose_kps[kp_match, 2] and dist[kp_match] < config['overwrite_tol']:
+                poses[kp_match, pt_id, :] = np.array([x, y, kp_confidences[j]])
+
+        for i in person_indices:
+            box = person_boxes[i]
+            left = box[0]
+            top = box[1]
+            width = box[2]
+            height = box[3]
+            frame = self.drawPred(frame, person_classIds[i], person_confidences[i], left, top, left + width,
+                                  top + height)
+
+        for pose in poses:
+            for seg in self.lines.values():
+                pt1 = (int(pose[seg[0], 0]), int(pose[seg[0], 1]))
+                pt2 = (int(pose[seg[1], 0]), int(pose[seg[1], 1]))
+                cv2.line(frame, pt1, pt2, (255, 0, 255), 1)
+            for x, y, c in pose:
+                if c > 0:
+                    cv2.circle(frame, (int(x), int(y)), 1, (0, 0, 255), 1)
+
+            #for x, y, c in pose[self.kp_face]:
+                #cv2.circle(frame, (int(x), int(y)), 1, (255, 0, 255), 1)
+        # for i in kp_indices:
+        #     box = kp_boxes[i]
+        #     left = box[0]
+        #     top = box[1]
+        #     width = box[2]
+        #     height = box[3]
+        #     frame = self.drawPred(frame, kp_classIds[i], kp_confidences[i], left, top, left + width, top + height)
+        return frame
+
+    def drawPred(self, frame, classId, conf, left, top, right, bottom):
+        # Draw a bounding box.
+        cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), thickness=1)
+
+        label = '%.2f' % conf
+        label = '%s:%s' % (self.classes[classId], label)
+
+        # Display the label at the top of the bounding box
+        labelSize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+        top = max(top, labelSize[1])
+        # cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255,255,255), cv.FILLED)
+        cv2.putText(frame, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), thickness=1)
+        return frame
+
+    def detect(self, srcimg):
+        img, newh, neww, padh, padw = self.resize_image(srcimg)
+        blob = cv2.dnn.blobFromImage(img, scalefactor=1 / 255.0, swapRB=True)
+        # blob = cv2.dnn.blobFromImage(self.preprocess(img))
+        # Sets the input to the network
+        self.net.setInput(blob, self._inputNames)
+
+        # Runs the forward pass to get output of the output layers
+        outs = self.net.forward(self.net.getUnconnectedOutLayersNames())[0].squeeze(axis=0)
+
+        # inference output
+        row_ind = 0
+        for i in range(self.nl):
+            h, w = int(self.inpHeight / self.stride[i]), int(self.inpWidth / self.stride[i])
+            length = int(self.na * h * w)
+            if self.grid[i].shape[2:4] != (h, w):
+                self.grid[i] = self._make_grid(w, h)
+
+            outs[row_ind:row_ind + length, 0:2] = (outs[row_ind:row_ind + length, 0:2] * 2. - 0.5 + np.tile(
+                self.grid[i], (self.na, 1))) * int(self.stride[i])
+            outs[row_ind:row_ind + length, 2:4] = (outs[row_ind:row_ind + length, 2:4] * 2) ** 2 * np.repeat(
+                self.anchor_grid[i], h * w, axis=0)
+
+            self.num_coords = outs.shape[1] - self.last_ind
+            outs[row_ind:row_ind + length, self.last_ind:] = outs[row_ind:row_ind + length, self.last_ind:] * 4. - 2.
+            outs[row_ind:row_ind + length, self.last_ind:] *= np.tile(np.repeat(self.anchor_grid[i], h * w, axis=0), (1, self.num_coords//2))
+            outs[row_ind:row_ind + length, self.last_ind:] += np.tile(np.tile(self.grid[i], (self.na, 1)) * int(self.stride[i]), (1, self.num_coords//2))
+            row_ind += length
+        srcimg = self.postprocess(srcimg, outs, padsize=(newh, neww, padh, padw))
+        return srcimg
diff --git a/models/multitask_centernet/multitask_centernet_2022sep.onnx b/models/multitask_centernet/multitask_centernet_2022sep.onnx