To let search engines know about new pages and to communicate the current structure of your site, you need a sitemap: a sitemap.xml file that lists all of the site's internal links. There are several ways to create one; you can use an external service or a plugin for your CMS, and each approach has its pros and cons.
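For reference, a minimal sitemap.xml looks like the sketch below (this matches the format the script in this article produces; <lastmod>, <changefreq> and <priority> are optional, and the URL and date are placeholders):
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://domain.tld/</loc>
    <lastmod>2024-01-01T00:00:00+00:00</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.00</priority>
  </url>
</urlset>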
In this example we will generate the sitemap file with a standalone PHP script. We found this script on the internet and have kept the link to its author in the file header; you can visit his page and download the original from there. We present the files here because we made some small edits so that the output matches what we expected.
1. Create a folder for the script
mkdir sitemap-gen
1.1 Create three files with the following names
cd sitemap-gen
touch sitemap.php
touch sitemap.functions.php
touch sitemap.config.php
sitemap.php
Create a new sitemap.php file with the following content:
<?php
/***************************\
|***DO NOT EDIT THIS FILE***|
|**EDIT sitemap.config.php**|
\***************************/
error_reporting(E_ALL);
//Read global variables from config file
require_once( 'sitemap.config.php' );
// Include all functions
require_once('sitemap.functions.php');
//Send plain text so browsers do not collapse \n (the default text/html header would)
header("Content-Type: text/plain");
$color = false;
$version_script = 2;
if ($version_script != $version_functions || $version_functions != $version_config){
logger("Script versions mismatch!",3);
logger("Update necessary",3);
logger("Version of sitemap.functions.php " .$version_functions ,3);
logger("Version of sitemap.config.php " .$version_config ,3);
logger("Version of sitemap.php " .$version_script ,3);
logger("Download new files here: https://www.github.com/knyzorg/sitemap-generator-crawler" ,3);
die("Stopped.");
}
// Add PHP CLI support
if (php_sapi_name() === 'cli' && PHP_OS != 'WINNT') {
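//Rebuild $argv as a query string so parse_str() turns key=value arguments into the $args array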
parse_str(implode('&', array_slice($argv, 1)), $args);
$color = true;
}
//Allow the config variables to be overridden from the CLI
if (isset($args['file'])) {
$file = $args['file'];
}
if (isset($args['site'])) {
$site = $args['site'];
}
if (isset($args['max_depth'])) {
$max_depth = $args['max_depth'];
}
if (isset($args['enable_frequency'])) {
$enable_frequency = $args['enable_frequency'];
}
if (isset($args['enable_priority'])) {
$enable_priority = $args['enable_priority'];
}
if (isset($args['enable_modified'])) {
$enable_modified = $args['enable_modified'];
}
if (isset($args['freq'])) {
$freq = $args['freq'];
}
if (isset($args['priority'])) {
$priority = $args['priority'];
}
if (isset($args['blacklist'])) {
$blacklist = $args['blacklist'];
}
if (isset($args['debug'])) {
$debug = $args['debug'];
}
if (isset($args['ignore_arguments'])) {
$ignore_arguments = !!$args['ignore_arguments'];
}
if (isset($args['pdf_index'])) {
$pdf_index = $args['pdf_index'];
}
//Begin stopwatch for statistics
$start = microtime(true);
//Setup file stream
$tempfile = tempnam(sys_get_temp_dir(), 'sitemap.xml.');
$file_stream = fopen($tempfile, "w") or die("Error: Could not create temporary file $tempfile" . "\n");
fwrite($file_stream, $xmlheader . "\n");
// Global variable, non-user defined
$depth = 0;
$indexed = 0;
$scanned = array();
$deferredLinks = array();
// Reduce the supplied URL to the domain root in case a deeper path was given
$real_site = domain_root($site);
if ($real_site != $site){
logger("Reformatted site from $site to $real_site", 2);
}
// Begin by crawling the original url
scan_url($real_site);
// Finalize sitemap
fwrite($file_stream, "</urlset>\n");
fclose($file_stream);
// Pretty-print sitemap
if ((PHP_OS == 'WINNT') ? `where xmllint` : `which xmllint`) {
logger("Found xmllint, pretty-printing sitemap", 0);
$responsevalue = exec('xmllint --format ' . $tempfile . ' -o ' . $tempfile . ' 2>&1', $discardedoutputvalue, $returnvalue);
if ($returnvalue) {
die("Error: " . $responsevalue . "\n");
}
}
// Generate and print out statistics
$time_elapsed_secs = round(microtime(true) - $start, 2);
logger("Sitemap has been generated in " . $time_elapsed_secs . " second" . (($time_elapsed_secs >= 1 ? 's' : '') . "and saved to $file"), 0);
$size = sizeof($scanned);
logger("Scanned a total of $size pages and indexed $indexed pages.", 0);
// Rename partial file to the real file name. `rename()` overwrites any existing files
rename($tempfile, $file);
// Apply permissions
chmod($file, $permissions);
// Declare that the script has finished executing and exit
logger("Operation Completed", 0);
sitemap.functions.php
Create a new sitemap.functions.php file with the following content:
<?php
// Abstracted function to output formatted logging
function logger($message, $type)
{
global $debug, $color;
if ($color) {
switch ($type) {
case 0:
//add
echo $debug["add"] ? "\033[0;32m [+] $message \033[0m\n" : "";
break;
case 1:
//reject
echo $debug["reject"] ? "\033[0;31m [-] $message \033[0m\n" : "";
break;
case 2:
//manipulate
echo $debug["warn"] ? "\033[1;33m [!] $message \033[0m\n" : "";
break;
case 3:
//critical
echo "\033[1;33m [!] $message \033[0m\n";
break;
}
return;
}
switch ($type) {
case 0:
//add
echo $debug["add"] ? "[+] $message\n" : "";
break;
case 1:
//reject
echo $debug["reject"] ? "31m [-] $message\n" : "";
break;
case 2:
//manipulate
echo $debug["warn"] ? "[!] $message\n" : "";
break;
case 3:
//critical
echo "[!] $message\n";
break;
}
}
function flatten_url($url)
{
global $real_site;
$path = explode($real_site, $url)[1];
return $real_site . remove_dot_seg($path);
}
/**
* Remove dot segments from a URI path according to RFC3986 Section 5.2.4
*
* @param $path
* @return string
* @link http://www.ietf.org/rfc/rfc3986.txt
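* Example: remove_dot_seg("dir/./sub/../page") returns "dir/page"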
*/
function remove_dot_seg($path)
{
if (strpos($path, '.') === false) {
return $path;
}
$inputBuffer = $path;
$outputStack = [];
/**
* 2. While the input buffer is not empty, loop as follows:
*/
while ($inputBuffer != '') {
/**
* A. If the input buffer begins with a prefix of "../" or "./",
* then remove that prefix from the input buffer; otherwise,
*/
if (strpos($inputBuffer, "./") === 0) {
$inputBuffer = substr($inputBuffer, 2);
continue;
}
if (strpos($inputBuffer, "../") === 0) {
$inputBuffer = substr($inputBuffer, 3);
continue;
}
/**
* B. if the input buffer begins with a prefix of "/./" or "/.",
* where "." is a complete path segment, then replace that
* prefix with "/" in the input buffer; otherwise,
*/
if ($inputBuffer === "/.") {
$outputStack[] = '/';
break;
}
if (substr($inputBuffer, 0, 3) === "/./") {
$inputBuffer = substr($inputBuffer, 2);
continue;
}
/**
* C. if the input buffer begins with a prefix of "/../" or "/..",
* where ".." is a complete path segment, then replace that
* prefix with "/" in the input buffer and remove the last
* segment and its preceding "/" (if any) from the output
* buffer; otherwise,
*/
if ($inputBuffer === "/..") {
array_pop($outputStack);
$outputStack[] = '/';
break;
}
if (substr($inputBuffer, 0, 4) === "/../") {
array_pop($outputStack);
$inputBuffer = substr($inputBuffer, 3);
continue;
}
/**
* D. if the input buffer consists only of "." or "..", then remove
* that from the input buffer; otherwise,
*/
if ($inputBuffer === '.' || $inputBuffer === '..') {
break;
}
/**
* E. move the first path segment in the input buffer to the end of
* the output buffer, including the initial "/" character (if
* any) and any subsequent characters up to, but not including,
* the next "/" character or the end of the input buffer.
*/
if (($slashPos = stripos($inputBuffer, '/', 1)) === false) {
$outputStack[] = $inputBuffer;
break;
} else {
$outputStack[] = substr($inputBuffer, 0, $slashPos);
$inputBuffer = substr($inputBuffer, $slashPos);
}
}
return ltrim(implode($outputStack), "/");
}
// Check if a URL has already been scanned
function is_scanned($url)
{
global $scanned;
if (isset($scanned[$url])) {
return true;
}
//Check if in array as dir and non-dir
$url = ends_with($url, "/") ? substr($url, 0, -1) : $url . "/";
if (isset($scanned[$url])) {
return true;
}
return false;
}
function ends_with($haystack, $needle)
{
$length = strlen($needle);
if ($length == 0) {
return true;
}
return (substr($haystack, -$length) === $needle);
}
// Gets the path for a relative link
// https://somewebsite.com/directory/file => https://somewebsite.com/directory/
// https://somewebsite.com/directory/subdir/ => https://somewebsite.com/directory/subdir/
function get_path($path)
{
$path_depth = explode("/", $path);
$len = strlen($path_depth[count($path_depth) - 1]);
return (substr($path, 0, strlen($path) - $len));
}
//Get the root of the domain
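// e.g. domain_root("https://example.com/blog/post") returns "https://example.com/"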
function domain_root($href)
{
$url_parts = explode('/', $href);
return $url_parts[0] . '//' . $url_parts[2] . '/';
}
//The curl client is created outside of the function so it is not re-created on every call (performance)
$curl_client = curl_init();
function get_data($url)
{
global $curl_validate_certificate, $curl_client, $index_pdf, $index_img, $crawler_user_agent, $enable_modified;
//Set URL
curl_setopt($curl_client, CURLOPT_URL, $url);
//Follow redirects and get new url
curl_setopt($curl_client, CURLOPT_RETURNTRANSFER, 1);
//Get headers
curl_setopt($curl_client, CURLOPT_HEADER, 1);
//Optionally avoid validating SSL
curl_setopt($curl_client, CURLOPT_SSL_VERIFYPEER, $curl_validate_certificate);
//Set user agent
curl_setopt($curl_client, CURLOPT_USERAGENT, $crawler_user_agent);
//Ask curl for the remote file time (needed for lastmod below)
curl_setopt($curl_client, CURLOPT_FILETIME, true);
//Get data
$data = curl_exec($curl_client);
$content_type = curl_getinfo($curl_client, CURLINFO_CONTENT_TYPE);
$http_code = curl_getinfo($curl_client, CURLINFO_HTTP_CODE);
$redirect_url = curl_getinfo($curl_client, CURLINFO_REDIRECT_URL);
//Scan new url, if redirect
if ($redirect_url) {
logger("URL is a redirect.", 1);
if (strpos($redirect_url, '?') !== false) {
$redirect_url = explode("?", $redirect_url)[0];
}
unset($url, $data);
if (!check_blacklist($redirect_url)) {
echo logger("Redirected URL is in blacklist", 1);
} else {
scan_url($redirect_url);
}
}
//If content acceptable, return it. If not, `false`
$html = ($http_code != 200 || stripos($content_type, "html") === false) ? false : $data;
//Additional data
if ($enable_modified){
//CURLINFO_FILETIME returns a Unix timestamp, or -1 if the server did not report a file time
$timestamp = curl_getinfo($curl_client, CURLINFO_FILETIME);
$modified = ($timestamp != -1) ? date('c', $timestamp) : null;
}
else {
$modified = null;
}
if (stripos($content_type, "application/pdf") !== false && $index_pdf) {
$html = "This is a PDF";
}
//Return it as an array
return array($html, $modified, (stripos($content_type, "image/") !== false && $index_img));
}
//Try to match string against blacklist
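// e.g. with "*.jpg" in the blacklist, check_blacklist("https://example.com/pic.jpg") returns false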
function check_blacklist($string)
{
global $blacklist;
if (is_array($blacklist)) {
foreach ($blacklist as $illegal) {
if (fnmatch($illegal, $string)) {
return false;
}
}
}
return true;
}
//Extract array of URLs from html document inside of `href`s
function get_links($html, $parent_url, $regexp)
{
if (preg_match_all("/$regexp/siU", $html, $matches)) {
if ($matches[2]) {
$found = array_map(function ($href) use (&$parent_url) {
global $real_site, $ignore_arguments;
logger("Checking $href", 2);
if (strpos($href, "#") !== false) {
logger("Dropping pound.", 2);
$href = preg_replace('/\#.*/', '', $href);
}
//Separate $href from $query_string
$query_string = '';
if (strpos($href, '?') !== false) {
list($href, $query_string) = explode('?', $href);
//Parse & to not break curl client. See issue #23
$query_string = str_replace('&', '&', $query_string);
}
if ($ignore_arguments) {
$query_string = '';
}
if ((substr($href, 0, 7) != "http://") && (substr($href, 0, 8) != "https://")) {
// Link does not call (potentially) external page
if (strpos($href, ":")) {
logger("URL is an invalid protocol", 1);
return false;
}
if ($href == '/') {
logger("$href is domain root", 2);
$href = $real_site;
} elseif (substr($href, 0, 1) == '/') {
logger("$href is relative to root, convert to absolute", 2);
$href = domain_root($real_site) . substr($href, 1);
} else {
logger("$href is relative, convert to absolute", 2);
$href = get_path($parent_url) . $href;
}
}
logger("Result: $href", 2);
if (!filter_var($href, FILTER_VALIDATE_URL)) {
logger("URL is not valid. Rejecting.", 1);
return false;
}
if (substr($href, 0, strlen($real_site)) != $real_site) {
logger("URL is not part of the target domain. Rejecting.", 1);
return false;
}
if (is_scanned($href . ($query_string ? '?' . $query_string : ''))) {
//logger("URL has already been scanned. Rejecting.", 1);
return false;
}
if (!check_blacklist($href)) {
logger("URL is blacklisted. Rejecting.", 1);
return false;
}
return flatten_url($href . ($query_string ? '?' . $query_string : ''));
}, $matches[2]);
return $found;
}
}
logger("Found nothing", 2);
return array();
}
function scan_url($url)
{
global $scanned, $deferredLinks, $file_stream, $freq, $priority, $enable_priority, $enable_frequency, $max_depth, $depth, $real_site, $indexed;
$depth++;
logger("Scanning $url", 2);
if (is_scanned($url)) {
logger("URL has already been scanned. Rejecting.", 1);
return $depth--;
}
if (substr($url, 0, strlen($real_site)) != $real_site) {
logger("URL is not part of the target domain. Rejecting.", 1);
return $depth--;
}
if (!($depth <= $max_depth || $max_depth == 0)) {
logger("Maximum depth exceeded. Rejecting.", 1);
return $depth--;
}
//Note that URL has been scanned
$scanned[$url] = 1;
//Send cURL request
list($html, $modified, $is_image) = get_data($url);
if ($is_image) {
//URL is an image; image indexing is not yet implemented (see $index_img in the config)
}
if (!$html) {
logger("Invalid Document. Rejecting.", 1);
return $depth--;
}
if (strpos($url, "&") && strpos($url, ";") === false) {
$url = str_replace("&", "&", $url);
}
$map_row = " <url>\n";
$map_row .= " <loc>$url</loc>\n";
if ($modified) {
$map_row .= " <lastmod>$modified</lastmod>\n";
}
if ($enable_frequency) {
$map_row .= " <changefreq>$freq</changefreq>\n";
}
if ($enable_priority) {
$map_row .= " <priority>$priority</priority>\n";
}
$map_row .= " </url>\n";
fwrite($file_stream, $map_row);
$indexed++;
logger("Added: " . $url . (($modified) ? " [Modified: " . $modified . "]" : ''), 0);
unset($is_image, $map_row);
// Extract urls from <a href="??"></a>
$ahrefs = get_links($html, $url, "<a\s[^>]*href=(\"|'??)([^\" >]*?)\\1[^>]*>(.*)<\/a>");
// Extract urls from <frame src="??">
$framesrc = get_links($html, $url, "<frame\s[^>]*src=(\"|'??)([^\" >]*?)\\1[^>]*>");
$links = array_filter(array_merge($ahrefs, $framesrc), function ($item) use (&$deferredLinks) {
return $item && !isset($deferredLinks[$item]);
});
unset($html, $url, $ahrefs, $framesrc);
logger("Found urls: " . join(", ", $links), 2);
//Note that URL has been deferred
foreach ($links as $href) {
if ($href) {
$deferredLinks[$href] = 1;
}
}
foreach ($links as $href) {
if ($href) {
scan_url($href);
}
}
$depth--;
}
// fnmatch() polyfill for non-POSIX systems
if (!function_exists('fnmatch')) {
function fnmatch($pattern, $string)
{
return preg_match("#^" . strtr(preg_quote($pattern, '#'), array('\*' => '.*', '\?' => '.')) . "$#i", $string);
} // end
} // end if
$version_functions = 2;
sitemap.config.php
Create a new sitemap.config.php file with the following content:
<?php
/*
Sitemap Generator by Slava Knyazev. Further acknowledgements in the README.md file.
Website: https://www.knyz.org/
I also live on GitHub: https://github.com/knyzorg
Contact me: Slava@KNYZ.org
*/
//Make sure to use the latest revision by downloading from github: https://github.com/knyzorg/Sitemap-Generator-Crawler
/* Usage
Usage is pretty straightforward:
- Configure the crawler by editing this file.
- Select the file to which the sitemap will be saved
- Select the URL to crawl
- Configure blacklists; wildcards are accepted (example: http://example.com/private/* and *.jpg)
- Generate the sitemap
- Either send a GET request to this script or run it from the command line (refer to the README file)
- Submit to Google
- Set up a cron job to execute this script every so often
It is recommended you don't remove the above for future reference.
*/
//date_default_timezone_set('Europe/Moscow');
date_default_timezone_set('Etc/GMT-1');
// Default site to crawl
$site = "https://synay.net";
// Default sitemap filename
$file = "sitemap.xml";
$permissions = 0644;
// Depth of the crawl, 0 is unlimited
$max_depth = 0;
// Show changefreq
$enable_frequency = true;
// Show priority
$enable_priority = true;
// Default values for changefreq and priority
$freq = "daily";
$priority = "1.00";
// Add lastmod based on the server response. Unreliable, but enabled in this example.
$enable_modified = true;
// Disable this for a misconfigured but otherwise acceptable SSL server.
$curl_validate_certificate = true;
// Pages matching these patterns will be excluded from the crawl and the sitemap.
// Use this to exclude non-HTML files, to improve performance and save bandwidth.
$blacklist = array(
"*.jpg",
"*/manager/*",
"https://synay.net/manager"
);
// Enable this if your site does not require GET arguments to function
$ignore_arguments = false;
// Not yet implemented. See issue #19 for more information.
$index_img = false;
//Index PDFs
$index_pdf = true;
// Set the user agent for the crawler
$crawler_user_agent = "Mozilla/5.0 (compatible; Sitemap Generator Crawler; +https://synay.net)";
// Header of the sitemap.xml
$xmlheader ='<?xml version="1.0" encoding="UTF-8"?>
<!-- Sitemap file generated for https://synay.net at '. date("D M j G:i:s T Y") . ' -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">';
// Optionally configure debug options
$debug = array(
"add" => true,
"reject" => false,
"warn" => false
);
//Modify only if configuration version is broken
$version_config = 2;
Copy each listing into the file of the same name that you created earlier.
2. Run the script
From the root of the sitemap-gen folder, run the script, substituting your own domain for domain.tld and the path where the sitemap.xml file should be saved:
php sitemap.php file=/home/user/sitemap-gen/sitemap.xml site=https://domain.tld
In this example we are using PHP 8.1 with the php-xml module.
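If you are not sure whether the module is available, you can list the loaded PHP modules and filter for xml (php-xml is the Debian/Ubuntu package name; other distributions may use a different one):
php -m | grep -i xml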
The script will crawl every internal link on your site and, when it is done, report that it has finished and how long it took.
While the sitemap is being generated you will also see which pages are currently being crawled.
Generating a sitemap for a site of about 5,000 pages takes roughly 45 minutes.
If you wish, you can add this task to cron so that the script refreshes the data in the sitemap.xml file on a regular schedule.
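For example, a crontab entry that rebuilds the sitemap every night at 03:00 might look like this (the paths are taken from the example above and are assumptions; adjust them to your setup):
0 3 * * * php /home/user/sitemap-gen/sitemap.php file=/home/user/sitemap-gen/sitemap.xml site=https://domain.tld > /dev/null 2>&1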