Re[5]: Как распарсить https://apnews.com/
От: kov_serg Россия  
Дата: 06.11.24 12:00
Оценка: 3 (1)
Здравствуйте, Passerby, Вы писали:

P>Здравствуйте, kov_serg, Вы писали:

_>>Это защита от ботов. Она привязана к IP и user-agent и доказывает прохождение капчи. Для начала можно просто из браузера скопировать, ну а потом:
P>Понятно. Много дней потратили на изучение получения таких куков? Я умножу и получу результат для себя.

А вы ленивы
<?php require "simple_html_dom.php"; // https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.9.1

// Prints a numbered list of headlines (elements matching .PagePromo-title)
// from an AP News section page. The request is sent with a cf_clearance
// cookie and the user agent stored next to it in config.json; when that file
// is missing it is produced by running main.py first.

//$url='https://apnews.com/politics';
$url='https://apnews.com/sports';

$user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36';

$cfg_fn='config.json';
if (!file_exists($cfg_fn)) { // https://github.com/Xewdy444/CF-Clearance-Scraper
    // Every interpolated value is escaped so that quotes or shell
    // metacharacters in the user agent or file name cannot break the command.
    exec("python3 main.py https://apnews.com/ -v -d -f " . escapeshellarg($cfg_fn) .
         " --user-agent=" . escapeshellarg($user_agent));
}

// The helper may have failed, or the file may be damaged: verify that the
// first cookie entry really exists before dereferencing it.
$cfg=is_readable($cfg_fn) ? json_decode(file_get_contents($cfg_fn)) : null;
if (!isset($cfg->clearance_cookies[0]->cf_clearance, $cfg->clearance_cookies[0]->user_agent)) {
    // Drop the unusable file so that the next run requests a fresh cookie.
    if (file_exists($cfg_fn)) unlink($cfg_fn);
    die("no clearance cookie\n");
}

// Use the user agent recorded with the cookie rather than the one requested above.
$user_agent=$cfg->clearance_cookies[0]->user_agent;
$cf_clearance=$cfg->clearance_cookies[0]->cf_clearance;

$ctx=stream_context_create(['http'=>[
    'method'=>"GET",
    'header'=>"User-Agent: $user_agent\r\n" .
              "Cookie: cf_clearance=$cf_clearance"
]]);

$html=file_get_html($url,false,$ctx);
if (!$html) {
    // The page could not be fetched with the stored cookie: discard the
    // config so that the next run obtains a new one.
    unlink($cfg_fn);
    die("fail\n");
}

$i=0;
foreach($html->find('.PagePromo-title') as $title) {
    $news_text=trim($title->plaintext);
    // The link is collected only for the HTML output variant kept below.
    $news_url='';
    $link=$title->find('a',0);
    if ($link) $news_url=$link->href;
    //printf("%2d. <a href='%s'>%s</a><br/>\n",++$i,$news_url,htmlspecialchars($news_text));
    printf("%2d. %s\n",++$i,$news_text);
}

  main.py
from __future__ import annotations

import argparse
import asyncio
import io
import json
import logging
import sys
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Any, Iterable, List, Optional

import nest_asyncio
import nodriver
from nodriver import cdp
from nodriver.cdp.network import Cookie
from nodriver.core.element import Element
from nodriver.core.tab import Tab
from selenium_authenticated_proxy import SeleniumAuthenticatedProxy


class PrintLocker:
    """
    Switch ``print`` output off and on by swapping ``sys.stdout``.

    Used as a context manager, the real standard output is restored while the
    ``with`` body runs and replaced by a throw-away buffer again on exit.
    """

    @staticmethod
    def unlock() -> None:
        """Point ``sys.stdout`` back at the interpreter's original stream."""
        sys.stdout = sys.__stdout__

    @staticmethod
    def lock() -> None:
        """Redirect ``sys.stdout`` into a fresh in-memory buffer."""
        sys.stdout = io.StringIO()

    def __enter__(self) -> None:
        # Printing is allowed only inside the ``with`` body.
        self.unlock()

    def __exit__(self, *exc_info: Any) -> None:
        # Always re-lock; returning None means exceptions are not suppressed.
        self.lock()


class NodriverOptions(list):
    """A plain list of browser arguments that also offers ``add_argument``."""

    def add_argument(self, arg: str) -> None:
        """
        Append one command-line argument to the end of the list.

        Parameters
        ----------
        arg : str
            The argument to store.
        """
        self.extend((arg,))


class ChallengePlatform(Enum):
    """
    Cloudflare challenge platform types.

    Each member's value is the text searched for in the page source as
    ``cType: '<value>'`` when detecting which challenge is being shown.
    """

    # Members are tried in declaration order during detection.
    JAVASCRIPT = "non-interactive"
    MANAGED = "managed"
    INTERACTIVE = "interactive"


class CloudflareSolver:
    """
    A class for solving Cloudflare challenges with a nodriver-controlled browser.

    Instances are asynchronous context managers: entering starts the browser,
    leaving stops it.

    Parameters
    ----------
    timeout : float
        The timeout in seconds to use for browser actions and solving challenges.
    http2 : bool
        Enable or disable the usage of HTTP/2 for the browser requests.
    headless : bool
        Enable or disable headless mode for the browser.
    proxy : Optional[str]
        The proxy server URL to use for the browser requests.
    """

    def __init__(
        self,
        *,
        timeout: float,
        http2: bool,
        headless: bool,
        proxy: Optional[str],
    ) -> None:
        options = NodriverOptions()

        if not http2:
            options.add_argument("--disable-http2")

        if headless:
            options.add_argument("--headless=new")

        if proxy is not None:
            # NOTE(review): presumably the proxy helper registers its settings
            # through ``options.add_argument`` - confirm against the library.
            auth_proxy = SeleniumAuthenticatedProxy(proxy, use_legacy_extension=True)
            auth_proxy.enrich_chrome_options(options)

        config = nodriver.Config(browser_args=options)
        self.driver = nodriver.Browser(config)
        self._timeout = timeout

    async def __aenter__(self) -> CloudflareSolver:
        """Start the browser and return the solver itself."""
        await self.driver.start()
        return self

    async def __aexit__(self, *_: Any) -> None:
        """Stop the browser; exceptions from the ``async with`` body propagate."""
        self.driver.stop()

    @staticmethod
    def set_user_agent(tab: Tab, user_agent: str) -> None:
        """
        Set the user agent for the browser tab.

        Parameters
        ----------
        tab : Tab
            The browser tab.
        user_agent : str
            The user agent string.
        """
        tab.feed_cdp(cdp.emulation.set_user_agent_override(user_agent))

    @staticmethod
    def extract_clearance_cookie(
        cookies: Iterable[Cookie],
    ) -> Optional[Cookie]:
        """
        Extract the Cloudflare clearance cookie from a list of cookies.

        Parameters
        ----------
        cookies : Iterable[Cookie]
            List of cookies.

        Returns
        -------
        Optional[Cookie]
            The first cookie named ``cf_clearance``. Returns None if the cookie
            is not found.
        """
        return next(
            (cookie for cookie in cookies if cookie.name == "cf_clearance"),
            None,
        )

    async def get_cookies(self) -> List[Cookie]:
        """
        Get all cookies known to the browser.

        Returns
        -------
        List[Cookie]
            List of cookies.
        """
        return await self.driver.cookies.get_all()

    async def detect_challenge(self) -> Optional[ChallengePlatform]:
        """
        Detect the Cloudflare challenge platform on the current page.

        The page source is searched for ``cType: '<value>'`` for every
        ``ChallengePlatform`` member, in declaration order.

        Returns
        -------
        Optional[ChallengePlatform]
            The Cloudflare challenge platform, or None when no marker is found.
        """
        html: str = await self.driver.main_tab.get_content()

        for platform in ChallengePlatform:
            if f"cType: '{platform.value}'" in html:
                return platform

        return None

    async def solve_challenge(self) -> None:
        """
        Solve the Cloudflare challenge on the current page.

        Keeps clicking the challenge widget until a clearance cookie appears,
        the challenge marker disappears from the page, or the configured
        timeout has elapsed. Exceptions raised by the browser calls (other than
        a failed position lookup of the widget) propagate to the caller.
        """
        start_timestamp = datetime.now()

        while (
            self.extract_clearance_cookie(await self.get_cookies()) is None
            and await self.detect_challenge() is not None
            # total_seconds() keeps the fractional part and does not wrap after
            # a day, unlike the ``seconds`` attribute of a timedelta.
            and (datetime.now() - start_timestamp).total_seconds() < self._timeout
        ):
            widget_input = await self.driver.main_tab.find("input")

            if widget_input.parent is None or not widget_input.parent.shadow_roots:
                # The widget is not attached yet; poll again shortly.
                await asyncio.sleep(0.25)
                continue

            shadow_root = Element(
                widget_input.parent.shadow_roots[0],
                self.driver.main_tab,
                widget_input.parent.tree,
            )

            children = shadow_root.children

            if not children:
                # An empty shadow root is treated as "still loading" instead of
                # failing with an IndexError.
                await asyncio.sleep(0.25)
                continue

            challenge = children[0]

            if (
                isinstance(challenge, Element)
                and "display: none;" not in challenge.attrs["style"]
            ):
                await asyncio.sleep(1)

                try:
                    await challenge.get_position()
                except Exception:
                    # The position could not be determined; re-evaluate the
                    # loop conditions and try again.
                    continue

                await challenge.mouse_click()


def _build_argument_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the cookie scraper."""
    parser = argparse.ArgumentParser(
        description="A simple program for scraping Cloudflare clearance (cf_clearance) cookies from websites issuing Cloudflare challenges to visitors"
    )

    parser.add_argument(
        "url",
        metavar="URL",
        help="The URL to scrape the Cloudflare clearance cookie from",
        type=str,
    )

    parser.add_argument(
        "-f",
        "--file",
        default=None,
        help="The file to write the Cloudflare clearance cookie information to, in JSON format",
        type=str,
    )

    parser.add_argument(
        "-t",
        "--timeout",
        default=30,
        help="The timeout in seconds to use for solving challenges",
        type=float,
    )

    parser.add_argument(
        "-p",
        "--proxy",
        default=None,
        help="The proxy server URL to use for the browser requests",
        type=str,
    )

    parser.add_argument(
        "-ua",
        "--user-agent",
        default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        help="The user agent to use for the browser requests",
        type=str,
    )

    parser.add_argument(
        "--disable-http2",
        action="store_true",
        help="Disable the usage of HTTP/2 for the browser requests",
    )

    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        help="Run the browser in headed mode",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase the output verbosity",
    )

    return parser


def _append_cookie_record(
    path: str,
    cookie: Cookie,
    user_agent: str,
    proxy: Optional[str],
) -> None:
    """Append the cookie's details to the ``clearance_cookies`` list in *path*."""
    try:
        with open(path, encoding="utf-8") as file:
            json_data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        json_data = {}

    # A file holding valid JSON of an unexpected shape is handled like an
    # undecodable one instead of failing with a TypeError/KeyError below.
    if not isinstance(json_data, dict):
        json_data = {}

    local_timezone = datetime.now(timezone.utc).astimezone().tzinfo
    # NOTE(review): presumably the cookie expires one year after it is issued,
    # so subtracting 365 days recovers the issue time - confirm.
    unix_timestamp = cookie.expires - timedelta(days=365).total_seconds()
    timestamp = datetime.fromtimestamp(unix_timestamp, tz=local_timezone).isoformat()

    json_data.setdefault("clearance_cookies", []).append(
        {
            "unix_timestamp": int(unix_timestamp),
            "timestamp": timestamp,
            "domain": cookie.domain,
            "cf_clearance": cookie.value,
            "user_agent": user_agent,
            "proxy": proxy,
        }
    )

    with open(path, "w", encoding="utf-8") as file:
        json.dump(json_data, file, indent=4)


async def main() -> None:
    """
    Obtain a ``cf_clearance`` cookie for the URL given on the command line.

    The browser is started, pointed at the URL and, when no clearance cookie
    is present yet, the detected Cloudflare challenge is solved. The cookie is
    then logged, printed to standard output when not in verbose mode, and
    appended to the JSON file given with ``--file``. The file is written
    whenever a cookie was obtained, including the case where it was already
    available without solving a challenge.
    """
    args = _build_argument_parser().parse_args()
    nest_asyncio.apply()

    # Silence every print() except the final cookie line printed below.
    print_locker = PrintLocker()
    print_locker.lock()

    logging_level = logging.INFO if args.verbose else logging.ERROR

    logging.basicConfig(
        format="[%(asctime)s] [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        level=logging_level,
    )

    logging.getLogger("nodriver").setLevel(logging.WARNING)
    logging.info("Launching %s browser...", "headed" if args.debug else "headless")

    challenge_messages = {
        ChallengePlatform.JAVASCRIPT: "Solving Cloudflare challenge [JavaScript]...",
        ChallengePlatform.MANAGED: "Solving Cloudflare challenge [Managed]...",
        ChallengePlatform.INTERACTIVE: "Solving Cloudflare challenge [Interactive]...",
    }

    async with CloudflareSolver(
        timeout=args.timeout,
        http2=not args.disable_http2,
        headless=not args.debug,
        proxy=args.proxy,
    ) as solver:
        solver.set_user_agent(solver.driver.main_tab, args.user_agent)

        # fix.begin
        # Read back the user agent the page actually reports and record that
        # one, in case it differs from the requested value.
        ua, _ = await solver.driver.main_tab.send(
            cdp.runtime.evaluate(expression="navigator.userAgent")
        )
        if args.user_agent != ua.value:
            args.user_agent = ua.value
            logging.info("user_agent=%s", ua.value)
        # fix.end

        await solver.driver.main_tab.reload()
        logging.info("Going to %s...", args.url)

        try:
            await solver.driver.get(args.url)
        except asyncio.TimeoutError as err:
            logging.error(err)
            return

        clearance_cookie = solver.extract_clearance_cookie(await solver.get_cookies())

        if clearance_cookie is None:
            challenge_platform = await solver.detect_challenge()

            if challenge_platform is None:
                logging.error("No Cloudflare challenge detected.")
                return

            logging.info(challenge_messages[challenge_platform])

            try:
                await solver.solve_challenge()
            except asyncio.TimeoutError:
                # Deliberately ignored: the cookie check below reports failure.
                pass

            clearance_cookie = solver.extract_clearance_cookie(
                await solver.get_cookies()
            )

    if clearance_cookie is None:
        logging.error("Failed to retrieve a Cloudflare clearance cookie.")
        return

    logging.info("Cookie: cf_clearance=%s", clearance_cookie.value)
    logging.info("User agent: %s", args.user_agent)

    if not args.verbose:
        # In verbose mode the cookie has already been logged above.
        with print_locker:
            print(f"cf_clearance={clearance_cookie.value}")

    if args.file is None:
        return

    logging.info("Writing Cloudflare clearance cookie information to %s...", args.file)
    _append_cookie_record(args.file, clearance_cookie, args.user_agent, args.proxy)


# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
  output
[16:27:56] [INFO] Launching headed browser...
[16:27:56] [INFO] Going to https://apnews.com/...
[16:27:57] [INFO] Solving Cloudflare challenge [Managed]...
[16:28:00] [INFO] Cookie: cf_clearance=GB1G2gzWCizNO_BrRHcHi_Y1tvA3SXlayBABwc6fzuM-1730899677-1.2.1.1-0feVnS_aJeeUko7dqVUKdw.8AekwrxvYhS5shs3tj2ecGdvb8HEVZVLg4euN2CpJJw5TxxITnb7El89dMeVWg7YylOC_bAcg3Byo1_CfCXOa_zeo2LKuh1hPMe8IFGoqLDfQm5HshstBd67q6spGO.9hN6RfD9jMXswhoD9yZjKhIgbfunaKuGnJzv2gBk_LtDg9N9dFWFI6SlowCLUDsenyxKVdZgIQjZUsr_.f7TKj7CxX2uR8PBCHre2GIgF8P63HChfNJ5wqKiM4MzIWYHWFnBFtRq3Joe.4IkCj6r7F7nsUErH518kOU8wrZQ_OBny3Pa6E.9icq8J.Ta7p8kGbp7prgawJ.Cdp4jV.wgpXNEyUP0l3xXk8gHL0OuV3b_Lg5_vPO5FagD.3ZohRtBsMMN8u0ZF1ekW3zWLA09g
[16:28:00] [INFO] User agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36
[16:28:00] [INFO] Writing Cloudflare clearance cookie information to config.json...
[16:28:01] [WARNING] Loop <_UnixSelectorEventLoop running=False closed=True debug=False> that handles pid 51860 is closed
 1. Oregon gets top billing in College Football Playoff’s opening rankings, Ohio St 2nd and Georgia 3rd
 2. Dodgers star Shohei Ohtani has surgery to repair labrum tear in shoulder after World Series injury
 3. NFL trade deadline: Commanders acquire Lattimore; Lions get Za’Darius Smith; Steelers add Williams
 4. 76ers’ Joel Embiid is suspended by the NBA for three games for shoving a newspaper columnist
 5. Oklahoma State coach Mike Gundy apologizes after lashing out at critics
 6. Fans celebrate as Los Angeles Dodgers win World Series
 7. Michael Jordan speaks after NASCAR court hearing
 8. Japan celebrates Shohei Ohtani as Dodgers win World Series
 9. U.S. Nordic combined salvages program with grant from International Ski and Snowboard Federation
10. Real Madrid, Manchester City both humiliated in Champions League, Liverpool enjoys Alonso’s return
11. Jerry Jones says Dak Prescott is likely headed to IR, but owner isn’t giving up on Cowboys’ season
12. The Commanders acquire 4-time Pro Bowl cornerback Marshon Lattimore from the Saints
13. Clemson coach Dabo Swinney challenged at poll when out to vote in election
14. Which move is better down 1 late? Kick the extra point or go for 2
15. Bills release safety Mike Edwards after failing in a bid to trade him before NFL deadline
16. Ravens QB Lamar Jackson misses practice again, but his coach says he’ll play against Cincinnati
17. Bowles says the Bucs need to do little things better to pull out of midseason tailspin
18. Florida finds its footing under coach Billy Napier. It should be enough to earn him a 4th year
19. From rankings, to 4-wheelers, to tortillas, Deion Sanders had plenty on his mind after the bye week
20. South Carolina off to another late surge after convincing win over Top 10 Texas A&amp;M
21. Preseason Big 12 top-five teams Utah, Oklahoma St., Kansas and Arizona at bottom of league standings
22. Spurs forward Sochan having surgery to repair fractured left thumb suffered in loss to Clippers
23. Hornets lose center Nick Richards for at least two weeks with rib cartilage fracture
24. Nuggets forward Aaron Gordon expected to miss multiple weeks with right calf strain, AP sources say
25. Clippers overcome 26-point deficit for their 1st victory at Intuit Dome
26. Celebrini returns from injury and Wennberg scores in OT to lift Sharks over Blue Jackets 2-1
27. Quinn Hughes and Brock Boeser score big in Canucks’ 5-1 rout of Ducks
28. Lehkonen scores go-ahead goal in return from injury as Avalanche beat Kraken 6-3
29. Lewis scores twice, Kings roll to 5-1 win over Wild
30. Led by Mark Sears, No. 2 Alabama is perhaps SEC’s best hope to end national title drought
31. Sixth-ranked Gonzaga opens with 101-63 victory over No. 8 Baylor
32. Unranked Ohio State leads wire-to-wire in 80-72 victory over No. 19 Texas
33. Love scores 17 and No. 10 Arizona opens season with 93-64 win over Canisius
34. South Carolina forward Ashlyn Watkins has charges against her dismissed
35. No. 1 South Carolina avoids major upset in 68-62 win over Michigan
36. No. 5 UCLA women overcome slow start to beat No. 17 Louisville 66-59 in Paris
37. No. 3 USC hits two late free throws to beat Ole Miss 68-66 in Paris
38. OMG hits Cooperstown as Mets’ home run sign will get Hall of Fame display
39. Braves stars Ronald Acuña Jr. and Spencer Strider not expected back from injuries by 2025 opener
40. Yankees GM Brian Cashman says he’s talked with agent Scott Boras about Juan Soto and Pete Alonso
41. Cashman sounds as if he intends to bring back Boone as manager, defends Yankees from Kelly criticism
42. Ancelotti ‘worried’ after another poor performance by Real Madrid
43. Arne Slot humbles Xabi Alonso as Liverpool routs Bayer Leverkusen 4-0 in the Champions League
44. Liverpool forward Luis Diaz scores second-half hat trick in 4-0 rout of Bayer Leverkusen
45. Amorim heading into Man United job on back of huge win over Man City
46. Judge rules Benjamin Mendy entitled to majority of his claim against former club Man City
47. McIlroy looks to clinch Race to Dubai title with new swing after 3 weeks shut away in a studio
48. Sauber signs Brazilian driver Gabriel Bortoleto to partner Nico Hülkenberg in F1 for 2025
49. South Dakota Coyotes play the Texas A&amp;M-Commerce Lions in non-conference action
50. Le Moyne and CSU Northridge set for cross-conference contest
51. No. 17 Indiana takes on SIU-Edwardsville for cross-conference game
52. Jacksonville State visits Air Force after Taylor’s 30-point outing
53. White leads North Dakota State against Illinois State after 22-point game
54. Central Michigan Chippewas host the Stony Brook Seawolves in non-conference action
55. Alcorn State Braves head to the Utah State Aggies
56. Jacksonville visits No. 21 Florida following Clayton’s 29-point game
57. West Georgia set for road matchup with the Georgia Tech Yellow Jackets
58. Olowoniyi leads Southern Indiana against Bucknell after 23-point outing
59. Cato leads Central Arkansas against Utah after 21-point game
60. Idaho State visits USC for non-conference showdown
61. Southern Miss visits UAB after Lendeborg’s 22-point game
62. Idaho hosts UC Davis following Johnson’s 35-point game
63. LSU hosts UL Monroe in out-of-conference matchup
64. Villanova takes on Columbia after Poplar’s 20-point showing
65. Florida A&amp;M visits SMU after Miller’s 21-point game
66. Southern faces Iowa after Dioumassi’s 30-point game
67. High Point hosts Coppin State for cross-conference contest
68. Chattanooga heads to Saint Mary’s (CA) for non-conference matchup
69. Boise State takes on Oakland in non-conference matchup
70. Pennsylvania hosts Maryland-Eastern Shore after Shaw’s 23-point game
71. No. 15 Creighton Bluejays to host UT Rio Grande Valley Vaqueros Wednesday
72. North Florida hosts Charleston Southern in cross-conference matchup
73. No. 25 Rutgers Scarlet Knights open season at home against the Wagner Seahawks
74. Northern Iowa Panthers to take on the Milwaukee Panthers Thursday
75. Stetson Hatters to host the Omaha Mavericks on Thursday
Отредактировано 06.11.2024 13:28 kov_serg . Предыдущая версия . Еще …
Отредактировано 06.11.2024 13:19 kov_serg . Предыдущая версия .
 
Подождите ...
Wait...
Пока на собственное сообщение не было ответов, его можно удалить.