Skip to content
This repository has been archived by the owner on Nov 25, 2023. It is now read-only.

Commit

Permalink
add mime content type crawling #1
Browse files Browse the repository at this point in the history
  • Loading branch information
ghost committed May 6, 2023
1 parent 0bd95d7 commit 702a14b
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 70 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ GET m=SphinxQL
* [x] Auto stop crawling on disk quota reached
* [x] Transactions support to prevent data loss on queue failures
* [x] Distributed index crawling between YGGo nodes through manifest API
* [ ] MIME Content-type crawler settings
* [x] MIME Content-type crawler settings
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter
Expand Down
97 changes: 50 additions & 47 deletions crontab/crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -236,40 +236,39 @@
continue;
}

// Save image content on data settings enabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
// Skip image processing on MIME type not provided
if (!$hostImageContentType = $curl->getContentType()) {

// Skip image processing on MIME type not provided
if (!$contentType = $curl->getContentType()) {

continue;
}

// Skip image processing on MIME type not allowed in settings
if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) {
continue;
}

continue;
}
// Skip image processing on MIME type not allowed in settings
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) {

// Skip image processing without returned content
if (!$content = $curl->getContent()) {
continue;
}

continue;
}
// Skip image processing without returned content
if (!$content = $curl->getContent()) {

// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
continue;
}

continue;
}
// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {

if (!$hostImageBase64 = @base64_encode($curl->getContent())) {
continue;
}

continue;
}
if (!$hostImageBase64 = @base64_encode($curl->getContent())) {

$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time());
continue;
}

$hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId,
Filter::mime($hostImageContentType),
(!CRAWL_HOST_DEFAULT_META_ONLY ? 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64 : null),
time());
}

// Process pages crawl queue
Expand Down Expand Up @@ -344,38 +343,39 @@
}
}

// Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) {

continue;
}

// Skip following page links when the robots:nofollow attribute is detected
if (false !== stripos($metaRobots, 'nofollow')) {

continue;
}

// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
Filter::mime($contentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));

// Update manifest registry
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

$metaYggoManifestCRC32 = crc32($metaYggoManifest);

if (!$db->getManifest($metaYggoManifestCRC32)) {
$db->addManifest($metaYggoManifestCRC32,
$db->addManifest($metaYggoManifestCRC32,
$metaYggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
}
}

// Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) {

continue;
}

// Skip following page links when the robots:nofollow attribute is detected
if (false !== stripos($metaRobots, 'nofollow')) {

continue;
}

// Collect page images
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {

Expand All @@ -402,7 +402,7 @@

$imageSrc = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
}

Expand Down Expand Up @@ -466,16 +466,19 @@
// Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

// Save image info
// Save new image info
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));

if (!$hostImageId && // image not exists
$hostStatus && // host enabled
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
$hostStatus && // host enabled
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit

// Add host image
if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) {
if ($hostImageId = $db->addHostImage($hostId,
crc32($hostImageURI->string),
$hostImageURI->string,
time())) {

$hostImagesAdded++;

Expand Down
Binary file modified database/yggo.mwb
Binary file not shown.
7 changes: 7 additions & 0 deletions library/filter.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ static public function url(mixed $url) {
return trim(urldecode($url));
}

/**
 * Casts the given MIME value to a string and strips surrounding whitespace.
 *
 * @param mixed $mime Raw MIME / Content-Type value.
 *
 * @return string The string form of $mime without leading or trailing whitespace.
 */
static public function mime(mixed $mime) {

  return trim((string) $mime);
}

static public function pageTitle(mixed $title) {

$title = (string) $title;
Expand Down
41 changes: 24 additions & 17 deletions library/mysql.php
Original file line number Diff line number Diff line change
Expand Up @@ -180,24 +180,26 @@ public function getHostImagesByLimit(int $hostId, int $limit) {
}

public function addHostImage(int $hostId,
int $crc32uri,
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $httpCode = null,
mixed $rank = null,
mixed $data = null) {
int $crc32uri,
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null,
mixed $data = null) {

$query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
`crc32uri`,
`uri`,
`timeAdded`,
`timeUpdated`,
`httpCode`,
`mime`,
`rank`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');

$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $data]);
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $data]);

return $this->_db->lastInsertId();
}
Expand All @@ -224,13 +226,14 @@ public function updateHostImageHttpCode(int $hostImageId,
return $query->rowCount();
}

public function updateHostImageData(int $hostImageId,
string $data,
int $timeUpdated) {
public function updateHostImage(int $hostImageId,
string $mime,
mixed $data,
int $timeUpdated) {

$query = $this->_db->prepare('UPDATE `hostImage` SET `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');

$query->execute([$data, $timeUpdated, $hostImageId]);
$query->execute([$mime, $data, $timeUpdated, $hostImageId]);

return $query->rowCount();
}
Expand Down Expand Up @@ -439,6 +442,7 @@ public function addHostPage(int $hostId,
int $timeAdded,
mixed $timeUpdated = null,
mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null,
mixed $metaTitle = null,
mixed $metaDescription = null,
Expand All @@ -451,13 +455,14 @@ public function addHostPage(int $hostId,
`timeAdded`,
`timeUpdated`,
`httpCode`,
`mime`,
`rank`,
`metaTitle`,
`metaDescription`,
`metaKeywords`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');

$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);

return $this->_db->lastInsertId();
}
Expand All @@ -466,14 +471,16 @@ public function updateHostPage( int $hostPageId,
mixed $metaTitle,
mixed $metaDescription,
mixed $metaKeywords,
string $mime,
mixed $data) {

$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
`metaDescription` = ?,
`metaKeywords` = ?,
`mime` = ?,
`data` = ? WHERE `hostPageId` = ? LIMIT 1');

$query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]);
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $hostPageId]);

return $query->rowCount();
}
Expand Down
10 changes: 5 additions & 5 deletions public/search.php
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,17 @@
$db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time());

if (200 != $hostImageHttpCode) continue;
if (!$hostImageContentType = $hostImageCurl->getContentType()) continue;
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) continue;

// Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageType = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue;

$hostImageURLencoded = 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64;
$hostImageURLencoded = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;

// Save image content on data settings enabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
$db->updateHostImageData($hostImage->hostImageId, (string) $hostImageURLencoded, time());
}
$db->updateHostImage($hostImage->hostImageId, Filter::mime($hostImageContentType), (!CRAWL_HOST_DEFAULT_META_ONLY ? $hostImageURLencoded : null), time());

// Local image data exists
} else {
Expand Down

0 comments on commit 702a14b

Please sign in to comment.