import torch
from torch import nn
import numpy as np
import os

from .utils.detect_face import detect_face, extract_face

class PNet(nn.Module):
    """MTCNN PNet.

    Keyword Arguments:
        pretrained {bool} -- Whether or not to load saved pretrained weights (default: {True})
    """

    def __init__(self, pretrained=True):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 10, kernel_size=3)
        self.prelu1 = nn.PReLU(10)
        self.pool1 = nn.MaxPool2d(2, 2, ceil_mode=True)
        self.conv2 = nn.Conv2d(10, 16, kernel_size=3)
        self.prelu2 = nn.PReLU(16)
        self.conv3 = nn.Conv2d(16, 32, kernel_size=3)
        self.prelu3 = nn.PReLU(32)
        self.conv4_1 = nn.Conv2d(32, 2, kernel_size=1)
        self.softmax4_1 = nn.Softmax(dim=1)
        self.conv4_2 = nn.Conv2d(32, 4, kernel_size=1)

        self.training = False

        if pretrained:
            state_dict_path = os.path.join(os.path.dirname(__file__), '../data/pnet.pt')
            state_dict = torch.load(state_dict_path)
            self.load_state_dict(state_dict)

    def forward(self, x):
        x = self.conv1(x)
        x = self.prelu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.prelu2(x)
        x = self.conv3(x)
        x = self.prelu3(x)
        a = self.conv4_1(x)
        a = self.softmax4_1(a)
        b = self.conv4_2(x)
        return b, a
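
# PNet is fully convolutional, so it accepts inputs of arbitrary spatial size
# and returns per-location outputs. A minimal shape sketch (the 12x12 input is
# PNet's receptive field; pretrained=False is used here only to avoid needing
# the weight file):
#
# >>> pnet = PNet(pretrained=False)
# >>> reg, probs = pnet(torch.randn(1, 3, 12, 12))
# >>> reg.shape, probs.shape  # bbox regression and face/no-face probabilities
# (torch.Size([1, 4, 1, 1]), torch.Size([1, 2, 1, 1]))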

class RNet(nn.Module):
    """MTCNN RNet.

    Keyword Arguments:
        pretrained {bool} -- Whether or not to load saved pretrained weights (default: {True})
    """

    def __init__(self, pretrained=True):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 28, kernel_size=3)
        self.prelu1 = nn.PReLU(28)
        self.pool1 = nn.MaxPool2d(3, 2, ceil_mode=True)
        self.conv2 = nn.Conv2d(28, 48, kernel_size=3)
        self.prelu2 = nn.PReLU(48)
        self.pool2 = nn.MaxPool2d(3, 2, ceil_mode=True)
        self.conv3 = nn.Conv2d(48, 64, kernel_size=2)
        self.prelu3 = nn.PReLU(64)
        self.dense4 = nn.Linear(576, 128)
        self.prelu4 = nn.PReLU(128)
        self.dense5_1 = nn.Linear(128, 2)
        self.softmax5_1 = nn.Softmax(dim=1)
        self.dense5_2 = nn.Linear(128, 4)

        self.training = False

        if pretrained:
            state_dict_path = os.path.join(os.path.dirname(__file__), '../data/rnet.pt')
            state_dict = torch.load(state_dict_path)
            self.load_state_dict(state_dict)

    def forward(self, x):
        x = self.conv1(x)
        x = self.prelu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.prelu2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.prelu3(x)
        x = x.permute(0, 3, 2, 1).contiguous()
        x = self.dense4(x.view(x.shape[0], -1))
        x = self.prelu4(x)
        a = self.dense5_1(x)
        a = self.softmax5_1(a)
        b = self.dense5_2(x)
        return b, a
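
# RNet operates on 24x24 candidate crops; the permute before flattening
# reorders the tensor so the flattened features line up with the ported
# pretrained weights. A shape sketch under the same assumptions as the PNet
# example above:
#
# >>> rnet = RNet(pretrained=False)
# >>> reg, probs = rnet(torch.randn(1, 3, 24, 24))
# >>> reg.shape, probs.shape
# (torch.Size([1, 4]), torch.Size([1, 2]))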

class ONet(nn.Module):
    """MTCNN ONet.

    Keyword Arguments:
        pretrained {bool} -- Whether or not to load saved pretrained weights (default: {True})
    """

    def __init__(self, pretrained=True):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3)
        self.prelu1 = nn.PReLU(32)
        self.pool1 = nn.MaxPool2d(3, 2, ceil_mode=True)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.prelu2 = nn.PReLU(64)
        self.pool2 = nn.MaxPool2d(3, 2, ceil_mode=True)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3)
        self.prelu3 = nn.PReLU(64)
        self.pool3 = nn.MaxPool2d(2, 2, ceil_mode=True)
        self.conv4 = nn.Conv2d(64, 128, kernel_size=2)
        self.prelu4 = nn.PReLU(128)
        self.dense5 = nn.Linear(1152, 256)
        self.prelu5 = nn.PReLU(256)
        self.dense6_1 = nn.Linear(256, 2)
        self.softmax6_1 = nn.Softmax(dim=1)
        self.dense6_2 = nn.Linear(256, 4)
        self.dense6_3 = nn.Linear(256, 10)

        self.training = False

        if pretrained:
            state_dict_path = os.path.join(os.path.dirname(__file__), '../data/onet.pt')
            state_dict = torch.load(state_dict_path)
            self.load_state_dict(state_dict)

    def forward(self, x):
        x = self.conv1(x)
        x = self.prelu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.prelu2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.prelu3(x)
        x = self.pool3(x)
        x = self.conv4(x)
        x = self.prelu4(x)
        x = x.permute(0, 3, 2, 1).contiguous()
        x = self.dense5(x.view(x.shape[0], -1))
        x = self.prelu5(x)
        a = self.dense6_1(x)
        a = self.softmax6_1(a)
        b = self.dense6_2(x)
        c = self.dense6_3(x)
        return b, c, a
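
# ONet operates on 48x48 crops and adds a third head (dense6_3) regressing 10
# values: x and y coordinates for 5 facial landmarks. A shape sketch:
#
# >>> onet = ONet(pretrained=False)
# >>> reg, landmarks, probs = onet(torch.randn(1, 3, 48, 48))
# >>> reg.shape, landmarks.shape, probs.shape
# (torch.Size([1, 4]), torch.Size([1, 10]), torch.Size([1, 2]))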

class MTCNN(nn.Module):
    """MTCNN face detection module.

    This class loads pretrained P-, R-, and O-nets and returns images cropped to include the face
    only, given raw input images of one of the following types:
        - PIL image or list of PIL images
        - numpy.ndarray (uint8) representing either a single image (3D) or a batch of images (4D).
    Cropped faces can optionally be saved to file as well.

    Keyword Arguments:
        image_size {int} -- Output image size in pixels. The image will be square. (default: {160})
        margin {int} -- Margin to add to bounding box, in terms of pixels in the final image.
            Note that the application of the margin differs slightly from the davidsandberg/facenet
            repo, which applies the margin to the original image before resizing, making the margin
            dependent on the original image size (this is a bug in davidsandberg/facenet).
            (default: {0})
        min_face_size {int} -- Minimum face size to search for. (default: {20})
        thresholds {list} -- MTCNN face detection thresholds. (default: {[0.6, 0.7, 0.7]})
        factor {float} -- Factor used to create a scaling pyramid of face sizes. (default: {0.709})
        post_process {bool} -- Whether or not to post-process image tensors before returning.
            (default: {True})
        select_largest {bool} -- If True, if multiple faces are detected, the largest is returned.
            If False, the face with the highest detection probability is returned.
            (default: {True})
        selection_method {string} -- Which heuristic to use for selection. Default None. If
            specified, will override select_largest:
                "probability": highest probability selected
                "largest": largest box selected
                "largest_over_threshold": largest box over a certain probability selected
                "center_weighted_size": box size minus weighted squared offset from image center
            (default: {None})
        keep_all {bool} -- If True, all detected faces are returned, in the order dictated by the
            select_largest parameter. If a save_path is specified, the first face is saved to that
            path and the remaining faces are saved to <save_path>1, <save_path>2 etc.
            (default: {False})
        device {torch.device} -- The device on which to run neural net passes. Image tensors and
            models are copied to this device before running forward passes. (default: {None})
    """

    def __init__(
        self, image_size=160, margin=0, min_face_size=20,
        thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
        select_largest=True, selection_method=None, keep_all=False, device=None
    ):
        super().__init__()

        self.image_size = image_size
        self.margin = margin
        self.min_face_size = min_face_size
        self.thresholds = thresholds
        self.factor = factor
        self.post_process = post_process
        self.select_largest = select_largest
        self.keep_all = keep_all
        self.selection_method = selection_method

        self.pnet = PNet()
        self.rnet = RNet()
        self.onet = ONet()

        self.device = torch.device('cpu')
        if device is not None:
            self.device = device
            self.to(device)

        if not self.selection_method:
            self.selection_method = 'largest' if self.select_largest else 'probability'
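
    # A typical construction pattern (a sketch, not part of this module):
    # resolving the device up front moves all three nets to the GPU when one
    # is available.
    #
    # >>> device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # >>> mtcnn = MTCNN(image_size=160, margin=14, keep_all=True, device=device)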

    def forward(self, img, save_path=None, return_prob=False):
        """Run MTCNN face detection on a PIL image or numpy array. This method performs both
        detection and extraction of faces, returning tensors representing detected faces rather
        than the bounding boxes. To access bounding boxes, see the MTCNN.detect() method below.

        Arguments:
            img {PIL.Image, np.ndarray, or list} -- A PIL image, np.ndarray, torch.Tensor, or list.

        Keyword Arguments:
            save_path {str} -- An optional save path for the cropped image. Note that when
                self.post_process=True, although the returned tensor is post-processed, the saved
                face image is not, so it is a true representation of the face in the input image.
                If `img` is a list of images, `save_path` should be a list of equal length.
                (default: {None})
            return_prob {bool} -- Whether or not to return the detection probability.
                (default: {False})

        Returns:
            Union[torch.Tensor, tuple(torch.Tensor, float)] -- If detected, cropped image of a face
                with dimensions 3 x image_size x image_size. Optionally, the probability that a
                face was detected. If self.keep_all is True, n detected faces are returned in an
                n x 3 x image_size x image_size tensor with an optional list of detection
                probabilities. If `img` is a list of images, the item(s) returned have an extra
                dimension (batch) as the first dimension.

        Example:
            >>> from facenet_pytorch import MTCNN
            >>> mtcnn = MTCNN()
            >>> face_tensor, prob = mtcnn(img, save_path='face.png', return_prob=True)
        """
        # Detect faces
        batch_boxes, batch_probs, batch_points = self.detect(img, landmarks=True)

        # Select faces
        if not self.keep_all:
            batch_boxes, batch_probs, batch_points = self.select_boxes(
                batch_boxes, batch_probs, batch_points, img, method=self.selection_method
            )

        # Extract faces
        faces = self.extract(img, batch_boxes, save_path)

        if return_prob:
            return faces, batch_probs
        else:
            return faces
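
    # When `img` is a list and keep_all=True, `faces` contains one entry per
    # input image: an n x 3 x image_size x image_size tensor, or None where no
    # face was found. A usage sketch (`img1` and `img2` are placeholder PIL
    # images):
    #
    # >>> mtcnn = MTCNN(keep_all=True)
    # >>> faces, probs = mtcnn([img1, img2], return_prob=True)
    # >>> len(faces)
    # 2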

    def detect(self, img, landmarks=False):
        """Detect all faces in a PIL image and return bounding boxes and optional facial landmarks.

        This method is used by the forward method and is also useful for face detection tasks
        that require lower-level handling of bounding boxes and facial landmarks (e.g., face
        tracking). The functionality of the forward function can be emulated by using this method
        followed by the extract_face() function.

        Arguments:
            img {PIL.Image, np.ndarray, or list} -- A PIL image, np.ndarray, torch.Tensor, or list.

        Keyword Arguments:
            landmarks {bool} -- Whether to return facial landmarks in addition to bounding boxes.
                (default: {False})

        Returns:
            tuple(numpy.ndarray, list) -- For N detected faces, a tuple containing an
                Nx4 array of bounding boxes and a length N list of detection probabilities.
                Returned boxes will be sorted in descending order by detection probability if
                self.select_largest=False, otherwise the largest face will be returned first.
                If `img` is a list of images, the items returned have an extra dimension
                (batch) as the first dimension. Optionally, a third item, the facial landmarks,
                is returned if `landmarks=True`.

        Example:
            >>> from PIL import Image, ImageDraw
            >>> from facenet_pytorch import MTCNN, extract_face
            >>> mtcnn = MTCNN(keep_all=True)
            >>> boxes, probs, points = mtcnn.detect(img, landmarks=True)
            >>> # Draw boxes and save faces
            >>> img_draw = img.copy()
            >>> draw = ImageDraw.Draw(img_draw)
            >>> for i, (box, point) in enumerate(zip(boxes, points)):
            ...     draw.rectangle(box.tolist(), width=5)
            ...     for p in point:
            ...         draw.rectangle((p - 10).tolist() + (p + 10).tolist(), width=10)
            ...     extract_face(img, box, save_path='detected_face_{}.png'.format(i))
            >>> img_draw.save('annotated_faces.png')
        """
        with torch.no_grad():
            batch_boxes, batch_points = detect_face(
                img, self.min_face_size,
                self.pnet, self.rnet, self.onet,
                self.thresholds, self.factor,
                self.device
            )

        boxes, probs, points = [], [], []
        for box, point in zip(batch_boxes, batch_points):
            box = np.array(box)
            point = np.array(point)
            if len(box) == 0:
                boxes.append(None)
                probs.append([None])
                points.append(None)
            elif self.select_largest:
                box_order = np.argsort((box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1]))[::-1]
                box = box[box_order]
                point = point[box_order]
                boxes.append(box[:, :4])
                probs.append(box[:, 4])
                points.append(point)
            else:
                boxes.append(box[:, :4])
                probs.append(box[:, 4])
                points.append(point)
        boxes = np.array(boxes, dtype=object)
        probs = np.array(probs, dtype=object)
        points = np.array(points, dtype=object)

        if (
            not isinstance(img, (list, tuple)) and
            not (isinstance(img, np.ndarray) and len(img.shape) == 4) and
            not (isinstance(img, torch.Tensor) and len(img.shape) == 4)
        ):
            boxes = boxes[0]
            probs = probs[0]
            points = points[0]

        if landmarks:
            return boxes, probs, points

        return boxes, probs

    def select_boxes(
        self, all_boxes, all_probs, all_points, imgs, method='probability', threshold=0.9,
        center_weight=2.0
    ):
        """Selects a single box from multiple for a given image using one of multiple heuristics.

        Arguments:
            all_boxes {np.ndarray} -- Ix0 ndarray where each element is a Nx4 ndarray of
                bounding boxes for N detected faces in I images (output from self.detect).
            all_probs {np.ndarray} -- Ix0 ndarray where each element is a Nx0 ndarray of
                probabilities for N detected faces in I images (output from self.detect).
            all_points {np.ndarray} -- Ix0 ndarray where each element is a Nx5x2 array of
                points for N detected faces (output from self.detect).
            imgs {PIL.Image, np.ndarray, or list} -- A PIL image, np.ndarray, torch.Tensor, or list.

        Keyword Arguments:
            method {str} -- Which heuristic to use for selection:
                "probability": highest probability selected
                "largest": largest box selected
                "largest_over_threshold": largest box over a certain probability selected
                "center_weighted_size": box size minus weighted squared offset from image center
                (default: {'probability'})
            threshold {float} -- Threshold for the "largest_over_threshold" method. (default: {0.9})
            center_weight {float} -- Weight for squared offset in the center weighted size method.
                (default: {2.0})

        Returns:
            tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray) -- Ix4 ndarray of selected
                bounding boxes for I images, Ix0 array of probabilities for each box, and an
                array of the corresponding landmark points.
        """
        # Copying batch detection from extract; it would be easier to ensure that
        # detect creates consistent output.
        batch_mode = True
        if (
            not isinstance(imgs, (list, tuple)) and
            not (isinstance(imgs, np.ndarray) and len(imgs.shape) == 4) and
            not (isinstance(imgs, torch.Tensor) and len(imgs.shape) == 4)
        ):
            imgs = [imgs]
            all_boxes = [all_boxes]
            all_probs = [all_probs]
            all_points = [all_points]
            batch_mode = False
        selected_boxes, selected_probs, selected_points = [], [], []
        for boxes, points, probs, img in zip(all_boxes, all_points, all_probs, imgs):
            if boxes is None:
                selected_boxes.append(None)
                selected_probs.append([None])
                selected_points.append(None)
                continue

            # If at least 1 box found
            boxes = np.array(boxes)
            probs = np.array(probs)
            points = np.array(points)

            if method == 'largest':
                box_order = np.argsort((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]))[::-1]
            elif method == 'probability':
                box_order = np.argsort(probs)[::-1]
            elif method == 'center_weighted_size':
                box_sizes = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
                img_center = (img.width / 2, img.height / 2)
                box_centers = np.array(list(zip((boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2)))
                offsets = box_centers - img_center
                offset_dist_squared = np.sum(np.power(offsets, 2.0), 1)
                box_order = np.argsort(box_sizes - offset_dist_squared * center_weight)[::-1]
            elif method == 'largest_over_threshold':
                box_mask = probs > threshold
                if box_mask.sum() == 0:
                    selected_boxes.append(None)
                    selected_probs.append([None])
                    selected_points.append(None)
                    continue
                # Mask boxes, probs, and points together so the sort indices below
                # refer to the same filtered set for all three arrays.
                boxes = boxes[box_mask]
                probs = probs[box_mask]
                points = points[box_mask]
                box_order = np.argsort((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]))[::-1]

            box = boxes[box_order][[0]]
            prob = probs[box_order][[0]]
            point = points[box_order][[0]]
            selected_boxes.append(box)
            selected_probs.append(prob)
            selected_points.append(point)

        if batch_mode:
            selected_boxes = np.array(selected_boxes)
            selected_probs = np.array(selected_probs)
            selected_points = np.array(selected_points)
        else:
            selected_boxes = selected_boxes[0]
            selected_probs = selected_probs[0][0]
            selected_points = selected_points[0]

        return selected_boxes, selected_probs, selected_points
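
    # The heuristic used here is normally chosen via the constructor's
    # selection_method argument rather than by calling select_boxes directly.
    # A sketch (`img` is a placeholder PIL image):
    #
    # >>> mtcnn = MTCNN(selection_method='center_weighted_size')
    # >>> face = mtcnn(img)  # prefers a large face near the image center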

    def extract(self, img, batch_boxes, save_path):
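        """Extract faces from an image or batch of images, given detected bounding boxes.

        Arguments:
            img {PIL.Image, np.ndarray, or list} -- A PIL image, np.ndarray, torch.Tensor, or list.
            batch_boxes {np.ndarray} -- Bounding boxes, as returned by self.detect().
            save_path {str} -- An optional save path (or list of paths) for the cropped faces.

        Returns:
            torch.Tensor -- Cropped face tensor(s), with None in place of images in which no
                face was detected.
        """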
        # Determine if a batch or single image was passed
        batch_mode = True
        if (
            not isinstance(img, (list, tuple)) and
            not (isinstance(img, np.ndarray) and len(img.shape) == 4) and
            not (isinstance(img, torch.Tensor) and len(img.shape) == 4)
        ):
            img = [img]
            batch_boxes = [batch_boxes]
            batch_mode = False

        # Parse save path(s)
        if save_path is not None:
            if isinstance(save_path, str):
                save_path = [save_path]
        else:
            save_path = [None for _ in range(len(img))]

        # Process all bounding boxes
        faces = []
        for im, box_im, path_im in zip(img, batch_boxes, save_path):
            if box_im is None:
                faces.append(None)
                continue

            if not self.keep_all:
                box_im = box_im[[0]]

            faces_im = []
            for i, box in enumerate(box_im):
                face_path = path_im
                if path_im is not None and i > 0:
                    save_name, ext = os.path.splitext(path_im)
                    face_path = save_name + '_' + str(i + 1) + ext

                face = extract_face(im, box, self.image_size, self.margin, face_path)
                if self.post_process:
                    face = fixed_image_standardization(face)
                faces_im.append(face)

            if self.keep_all:
                faces_im = torch.stack(faces_im)
            else:
                faces_im = faces_im[0]

            faces.append(faces_im)

        if not batch_mode:
            faces = faces[0]

        return faces

def fixed_image_standardization(image_tensor):
    processed_tensor = (image_tensor - 127.5) / 128.0
    return processed_tensor
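
# This is the fixed standardization applied when post_process=True: it maps
# uint8 pixel values in [0, 255] to roughly [-1, 1], independent of image
# content. For example:
#
# >>> fixed_image_standardization(torch.tensor([0., 127.5, 255.]))
# tensor([-0.9961,  0.0000,  0.9961])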

def prewhiten(x):
    mean = x.mean()
    std = x.std()
    std_adj = std.clamp(min=1.0 / (float(x.numel()) ** 0.5))
    y = (x - mean) / std_adj
    return y