diff --git a/DESCRIPTION b/DESCRIPTION index eb6990b..cacf82f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,5 +16,5 @@ LinkingTo: Rcpp Imports: Rcpp, magrittr LazyData: true Encoding: UTF-8 -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 0d9dab4..281b878 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(ocv_read) export(ocv_resize) export(ocv_sketch) export(ocv_stylize) +export(ocv_text) export(ocv_video) export(ocv_write) importFrom(Rcpp,sourceCpp) diff --git a/R/RcppExports.R b/R/RcppExports.R index 84d20c9..f14a751 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -105,3 +105,7 @@ cvmat_markers <- function(ptr) { .Call('_opencv_cvmat_markers', PACKAGE = 'opencv', ptr) } +text_detection <- function(input, confThreshold, nmsThreshold, inpWidth, inpHeight, model, draw) { + .Call('_opencv_text_detection', PACKAGE = 'opencv', input, confThreshold, nmsThreshold, inpWidth, inpHeight, model, draw) +} + diff --git a/R/text.R b/R/text.R new file mode 100644 index 0000000..b12a3b2 --- /dev/null +++ b/R/text.R @@ -0,0 +1,47 @@ +#' ocv_text text detection in images using EAST +#' +#' @description OpenCV sample using EAST (Efficient and Accurate Scene Text +#' Detector) to detect text in images. Requires the EAST pb-model not included +#' in the package. +#' +#' @param image opencv image to be processed. +#' @param width Preprocess input image by resizing to a specific width. It +#' should be a multiple of 32. +#' @param height Preprocess input image by resizing to a specific height. It +#' should be a multiple of 32. +#' @param thrs Confidence threshold between 0 and 1. +#' @param nms Non-maximum suppression threshold. +#' @param model Path to a binary .pb file that contains the trained network. +#' @param draw Draws visual output to the image. 
+#' +#' @examples +#' \dontrun{ +#' url <- paste0('https://upload.wikimedia.org/wikipedia/commons/6/6f/', +#' 'Keep-calm-and-carry-on-scan.jpg') +#' fl <- ocv_read(url) +#' +#' ocv_text(image = fl, thrs = 0.7, nms = 0.3, +#' model = "frozen_east_text_detection.pb") +#' } +#' +#' @references +#' https://github.com/opencv/opencv/blob/master/samples/dnn/text_detection.cpp +#' +#' @export +ocv_text <- function(image, thrs = 0.5, nms = 20, width = 320, height = 320, + model, draw = FALSE){ + + if(missing(model)) + stop("requires a model pb-file") + + model <- path.expand(model) + + text_detection(input = image, + confThreshold = thrs, + nmsThreshold = nms, + inpWidth = width, + inpHeight = height, + model = model, + draw = draw) +} + diff --git a/man/ocv_text.Rd b/man/ocv_text.Rd new file mode 100644 index 0000000..475bf78 --- /dev/null +++ b/man/ocv_text.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/text.R +\name{ocv_text} +\alias{ocv_text} +\title{ocv_text text detection in images using EAST} +\usage{ +ocv_text( + image, + thrs = 0.5, + nms = 20, + width = 320, + height = 320, + model, + draw = FALSE +) +} +\arguments{ +\item{image}{opencv image to be processed.} + +\item{thrs}{Confidence threshold between 0 and 1.} + +\item{nms}{Non-maximum suppression threshold.} + +\item{width}{Preprocess input image by resizing to a specific width. It +should be a multiple of 32.} + +\item{height}{Preprocess input image by resizing to a specific height. It +should be a multiple of 32.} + +\item{model}{Path to a binary .pb file that contains the trained network.} + +\item{draw}{Draws visual output to the image.} +} +\description{ +OpenCV sample using EAST (Efficient and Accurate Scene Text +Detector) to detect text in images. Requires the EAST pb-model not included +in the package. 
+} +\examples{ +\dontrun{ +url <- paste0('https://upload.wikimedia.org/wikipedia/commons/6/6f/', + 'Keep-calm-and-carry-on-scan.jpg') +fl <- ocv_read(url) + +ocv_text(image = fl, thrs = 0.7, nms = 0.3, + model = "frozen_east_text_detection.pb") +} + +} +\references{ +https://github.com/opencv/opencv/blob/master/samples/dnn/text_detection.cpp +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 4fe270b..e606488 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -296,6 +296,23 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// text_detection +XPtrMat text_detection(XPtrMat input, float confThreshold, float nmsThreshold, int inpWidth, int inpHeight, std::string model, bool draw); +RcppExport SEXP _opencv_text_detection(SEXP inputSEXP, SEXP confThresholdSEXP, SEXP nmsThresholdSEXP, SEXP inpWidthSEXP, SEXP inpHeightSEXP, SEXP modelSEXP, SEXP drawSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< XPtrMat >::type input(inputSEXP); + Rcpp::traits::input_parameter< float >::type confThreshold(confThresholdSEXP); + Rcpp::traits::input_parameter< float >::type nmsThreshold(nmsThresholdSEXP); + Rcpp::traits::input_parameter< int >::type inpWidth(inpWidthSEXP); + Rcpp::traits::input_parameter< int >::type inpHeight(inpHeightSEXP); + Rcpp::traits::input_parameter< std::string >::type model(modelSEXP); + Rcpp::traits::input_parameter< bool >::type draw(drawSEXP); + rcpp_result_gen = Rcpp::wrap(text_detection(input, confThreshold, nmsThreshold, inpWidth, inpHeight, model, draw)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_opencv_cvmat_destroy", (DL_FUNC) &_opencv_cvmat_destroy, 1}, @@ -324,6 +341,7 @@ static const R_CallMethodDef CallEntries[] = { {"_opencv_cvmat_edges", (DL_FUNC) &_opencv_cvmat_edges, 1}, {"_opencv_cvmat_hog", (DL_FUNC) &_opencv_cvmat_hog, 1}, {"_opencv_cvmat_markers", (DL_FUNC) &_opencv_cvmat_markers, 1}, + 
{"_opencv_text_detection", (DL_FUNC) &_opencv_text_detection, 7}, {NULL, NULL, 0} }; diff --git a/src/text.cpp b/src/text.cpp new file mode 100644 index 0000000..a25c7e0 --- /dev/null +++ b/src/text.cpp @@ -0,0 +1,171 @@ +#include "util.hpp" +#include + +#define OPENCV_VERSION (CV_VERSION_MAJOR * 10000 \ ++ CV_VERSION_MINOR * 100 \ ++ CV_VERSION_REVISION) + +#ifdef HAVE_OPENCV_DNN +#include "opencv2/dnn.hpp" +#endif + +using namespace cv; + +void decode(const Mat& scores, const Mat& geometry, float scoreThresh, + std::vector& detections, + std::vector& confidences) +{ +#if !defined(HAVE_OPENCV_DNN) || OPENCV_VERSION < 30403 + throw std::runtime_error("ocv_text req. OpenCV 3.4.3 or newer with DNN"); +#else + detections.clear(); + CV_Assert(scores.dims == 4); CV_Assert(geometry.dims == 4); + CV_Assert(scores.size[0] == 1); CV_Assert(geometry.size[0] == 1); + CV_Assert(scores.size[1] == 1); CV_Assert(geometry.size[1] == 5); + CV_Assert(scores.size[2] == geometry.size[2]); + CV_Assert(scores.size[3] == geometry.size[3]); + + const int height = scores.size[2]; + const int width = scores.size[3]; + for (int y = 0; y < height; ++y) + { + const float* scoresData = scores.ptr(0, 0, y); + const float* x0_data = geometry.ptr(0, 0, y); + const float* x1_data = geometry.ptr(0, 1, y); + const float* x2_data = geometry.ptr(0, 2, y); + const float* x3_data = geometry.ptr(0, 3, y); + const float* anglesData = geometry.ptr(0, 4, y); + for (int x = 0; x < width; ++x) + { + float score = scoresData[x]; + if (score < scoreThresh) + continue; + + // Decode a prediction. + // Multiple by 4 because feature maps are 4 time less than input image. 
+ float offsetX = x * 4.0f, offsetY = y * 4.0f; + float angle = anglesData[x]; + float cosA = std::cos(angle); + float sinA = std::sin(angle); + float h = x0_data[x] + x2_data[x]; + float w = x1_data[x] + x3_data[x]; + + Point2f offset(offsetX + cosA * x1_data[x] + sinA * x2_data[x], + offsetY - sinA * x1_data[x] + cosA * x2_data[x]); + Point2f p1 = Point2f(-sinA * h, -cosA * h) + offset; + Point2f p3 = Point2f(-cosA * w, sinA * w) + offset; + RotatedRect r(0.5f * (p1 + p3), Size2f(w, h), + -angle * 180.0f / (float)CV_PI); + detections.push_back(r); + confidences.push_back(score); + } + } +#endif +} + + +// [[Rcpp::export]] +XPtrMat + text_detection(XPtrMat input, float confThreshold,float nmsThreshold, + int inpWidth, int inpHeight, std::string model, bool draw) + { +#if !defined(HAVE_OPENCV_DNN) || OPENCV_VERSION < 30403 + throw std::runtime_error("ocv_text req. OpenCV 3.4.3 or newer with DNN"); +#else + + + Mat frame, blob; + Mat inp = get_mat(input); + frame = inp.clone(); + + std::vector<int> indices; + + std::vector<Mat> outs; + std::vector<String> outNames(2); + outNames[0] = "feature_fusion/Conv_7/Sigmoid"; + outNames[1] = "feature_fusion/concat_3"; + + // Load network. + cv::dnn::Net net = cv::dnn::readNet(model); + + cv::dnn::blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), + Scalar(123.68, 116.78, 103.94), true, false); + net.setInput(blob); + net.forward(outs, outNames); + + Mat scores = outs[0]; + Mat geometry = outs[1]; + + // Decode predicted bounding boxes. + std::vector<RotatedRect> boxes; + std::vector<float> confidences; + decode(scores, geometry, confThreshold, boxes, confidences); + + // Apply non-maximum suppression procedure. 
+ cv::dnn::NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices); + + // initialize output vectors + Rcpp::IntegerVector x1vec(indices.size()); + Rcpp::IntegerVector y1vec(indices.size()); + Rcpp::IntegerVector x2vec(indices.size()); + Rcpp::IntegerVector y2vec(indices.size()); + Rcpp::IntegerVector x3vec(indices.size()); + Rcpp::IntegerVector y3vec(indices.size()); + Rcpp::IntegerVector x4vec(indices.size()); + Rcpp::IntegerVector y4vec(indices.size()); + + // Render detections. + Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight); + for (size_t i = 0; i < indices.size(); ++i) + { + RotatedRect& box = boxes[indices[i]]; + + Point2f vertices[4]; + box.points(vertices); + for (int j = 0; j < 4; ++j) + { + vertices[j].x *= ratio.x; + vertices[j].y *= ratio.y; + } + + x1vec.at(i) = vertices[0].x; + y1vec.at(i) = vertices[0].y; + x2vec.at(i) = vertices[1].x; + y2vec.at(i) = vertices[1].y; + x3vec.at(i) = vertices[2].x; + y3vec.at(i) = vertices[2].y; + x4vec.at(i) = vertices[3].x; + y4vec.at(i) = vertices[3].y; + + // draws boxes to image + if (draw) + for (int j = 0; j < 4; ++j) + line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1); + } + + if (draw) { + // Draws efficiency information to the image. + std::vector<double> layersTimes; + double freq = getTickFrequency() / 1000; + double t = net.getPerfProfile(layersTimes) / freq; + std::string label = format("Inference time: %.2f ms", t); + putText(frame, label, Point(0, 15), + FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0)); + } + + // prepare output + XPtrMat out = cvmat_xptr(frame); + out.attr("indices") = Rcpp::DataFrame::create( + Rcpp::_["x1"] = x1vec, + Rcpp::_["y1"] = y1vec, + Rcpp::_["x2"] = x2vec, + Rcpp::_["y2"] = y2vec, + Rcpp::_["x3"] = x3vec, + Rcpp::_["y3"] = y3vec, + Rcpp::_["x4"] = x4vec, + Rcpp::_["y4"] = y4vec + ); + + return out; +#endif + }