Comment extraire des données du code source HTML d'une page Web (le contenu d'un onglet de la page) ?

Zac:

J'ai essayé plusieurs solutions proposées dans d'autres réponses, comme expérimenter avec différents agents utilisateurs (Chrome, Safari, etc.) ou récupérer le HTML directement avec HTTPClient et BufferedReader, mais aucune ne fonctionne. Comment puis-je obtenir sous Android la même sortie que sur le Web ? Voici la sortie Web que je cherche (voir le code source de la page https://finance.yahoo.com/quote/AAPL/financials?p=AAPL pour la sortie complète ; elle contient essentiellement l'onglet AJAX nommé « trimestriel », qui contient une table. J'ai besoin de ces données : le code source HTML obtenu sous Android ne les contient pas, alors que celui obtenu sur le Web les contient) :

root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) Miw($minGridWidth) 
Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) 
W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true},{"bundleName":"td-ads","name":"ComboAd","props":{"adparseStyle":{"marginBottom":"20px"},"finishedStyle":{"marginBottom":"20px"},"children":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LREC"}},{"bundleName":"td-ads","name":"Ad","props":{"pos":"MON"}}],"serverHeight":true,"key":"Col2-2-ComboAd","id":"Col2-2-ComboAd"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"similarCompanies","key":"Col2-3-QuoteModule","id":"Col2-3-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"earningsChart","key":"Col2-4-QuoteModule","id":"Col2-4-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"financialsChart","key":"Col2-5-QuoteModule","id":"Col2-5-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"react-finance",..."}}}};

Voici la sortie Android que j'obtiens :

(root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) 
Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true}

Avez-vous des suggestions ? Merci. Voici mon code :

// Fetch the page while presenting a desktop browser User-Agent string.
String desktopUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
Document doc = Jsoup.connect(requestURL)
        .userAgent(desktopUserAgent)
        .timeout(600000)
        .get();
// Elements whose class attribute equals myClassName (not used further in this snippet).
Elements tableDivs = doc.getElementsByAttributeValue("class", myClassName);
Elements scriptTags = doc.getElementsByTag("script");
// Write the data content of every <script> element to the error log under the tag "ONE".
for (int i = 0; i < scriptTags.size(); i++) {
    Log.e("ONE", scriptTags.get(i).data());
}
tmadam:

Yahoo Finance redirige vers guce.oath.com, qui nous informe de l'utilisation des cookies et d'autres données, et exige de cliquer sur « accepter » avant de fournir le contenu. On peut l'observer même dans un navigateur, si l'on efface les cookies et que l'on actualise la page.

On pourrait extraire (scraper) le lien depuis guce.oath.com, mais j'ai remarqué que l'URL finale comporte un paramètre guccounter=2 ; si nous utilisons cette URL, nous obtenons la réponse attendue.

// Target page, with guccounter=2 already present in the query string.
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL&guccounter=2";
// NOTE(review): looks like a placeholder — presumably to be replaced by a real browser User-Agent string.
String userAgent = "My UAString";
// Fetch the URL with the given User-Agent and obtain the result as a jsoup Document.
Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();

Étant donné que les données ne sont pas du HTML mais du code JavaScript, nous ne pouvons pas les analyser avec jsoup, mais nous pouvons utiliser des expressions régulières.

// Collect every <script> element of the fetched document.
Elements scriptTags = doc.getElementsByTag("script");
// Lazily captures the text between "root.App.main =" and the ";" that precedes
// the closing "}(this));" sequence.
String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
// Compiled once, outside the loop, instead of once per <script> element.
// DOTALL makes "." match line terminators too.
Pattern pattern = Pattern.compile(re, Pattern.DOTALL);
// Stays null when no script matches.
String data = null;

for (Element script : scriptTags) {
    Matcher matcher = pattern.matcher(script.html());

    if (matcher.find()) {
        // Group 1 is the captured text; stop at the first script that matches.
        data = matcher.group(1);
        break;
    }
}

La chaîne data doit contenir le dictionnaire issu du code JavaScript, qui est une chaîne JSON valide pouvant être analysée avec JSONObject.


Sous Android Studio, en revanche, il n'y a pas de redirection, pour autant que je puisse en juger. J'ai essayé avec plusieurs chaînes d'agent utilisateur, mais il semble que la page se charge directement. Le dictionnaire JavaScript qui contient les données est néanmoins présent, et nous pouvons l'extraire et l'analyser avec JSONObject.

Code pour Android Studio:

String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL";
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
// JSON key of the table row whose values are logged below.
String row = "totalRevenue";

try {
    Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();

    Elements scriptTags = doc.getElementsByTag("script");
    // Lazily captures the text between "root.App.main =" and the ";" that precedes
    // the closing "}(this));" sequence.
    String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
    // Compiled once, outside the loop, instead of once per <script> element.
    // DOTALL makes "." match line terminators too.
    Pattern pattern = Pattern.compile(re, Pattern.DOTALL);

    for (Element script : scriptTags) {
        Matcher matcher = pattern.matcher(script.html());

        if (matcher.find()) {
            // Group 1 is the captured text, parsed here as a JSON object.
            String data = matcher.group(1);
            JSONObject jo = new JSONObject(data);
            JSONArray table = getTable(jo);

            String[] tableRow = getRow(table, row);
            String values = TextUtils.join(", ", tableRow);
            Log.d("values", values);
            // Stop at the first matching script rather than scanning the rest.
            break;
        }
    }
} catch (Exception e) {
    // Single catch-all for this snippet: any failure is logged with its stack trace.
    Log.e("err", "err", e);
}

Cela devrait analyser les données et sélectionner les valeurs « Total des revenus ». Voici les méthodes getTable et getRow (ainsi que getDates) :

/**
 * Walks the parsed JSON down the fixed key path
 * context / dispatcher / stores / QuoteSummaryStore /
 * incomeStatementHistoryQuarterly / incomeStatementHistory.
 *
 * @param json root JSON object to start from
 * @return the array stored under "incomeStatementHistory"
 * @throws JSONException propagated from any of the lookups along the path
 */
private JSONArray getTable(JSONObject json) throws JSONException {
    JSONObject stores = json.getJSONObject("context")
            .getJSONObject("dispatcher")
            .getJSONObject("stores");
    JSONObject quarterly = stores.getJSONObject("QuoteSummaryStore")
            .getJSONObject("incomeStatementHistoryQuarterly");
    return quarterly.getJSONArray("incomeStatementHistory");
}

/**
 * Builds one table row: for each element of {@code table}, the "longFmt" value
 * found under the key {@code name}, as a string.
 *
 * @param table array of JSON objects, one per column
 * @param name  key to look up in each element
 * @return one string per element of {@code table}; "-" where the key or its
 *         "longFmt" entry is absent
 * @throws JSONException propagated from the JSON lookups
 */
private String[] getRow(JSONArray table, String name) throws JSONException {
    final int count = table.length();
    String[] cells = new String[count];
    for (int col = 0; col < count; col++) {
        JSONObject entry = table.getJSONObject(col);
        // Default placeholder, overwritten only when a formatted value exists.
        String text = "-";
        if (entry.has(name)) {
            JSONObject cell = entry.getJSONObject(name);
            if (cell.has("longFmt")) {
                text = cell.get("longFmt").toString();
            }
        }
        cells[col] = text;
    }
    return cells;
}

/**
 * Collects, for each element of {@code table}, the "fmt" value of its
 * "endDate" object, as a string.
 *
 * @param table array of JSON objects, each with an "endDate" object
 * @return one string per element of {@code table}
 * @throws JSONException propagated from the JSON lookups
 */
private String[] getDates(JSONArray table) throws JSONException {
    String[] dates = new String[table.length()];
    int idx = 0;
    while (idx < dates.length) {
        JSONObject endDate = table.getJSONObject(idx).getJSONObject("endDate");
        dates[idx] = endDate.get("fmt").toString();
        idx++;
    }
    return dates;
}

Je pense que la meilleure façon d'obtenir les données de la table est d'associer chaque nom de ligne du tableau HTML à une clé JSON. De plus, la table principale comporte cinq sous-tables ; nous pouvons donc associer chaque table imbriquée aux lignes qu'elle contient.

/**
 * Builds the mapping from displayed sub-table names to their rows, where each
 * row maps a displayed row label to the JSON key holding its values.
 *
 * <p>The maps are plain {@link LinkedHashMap} instances, so insertion order is
 * kept. Double-brace initialisation is avoided on purpose: it creates an
 * anonymous subclass per map, and each one keeps a hidden reference to the
 * enclosing instance.
 *
 * @return sub-table name to (row label to JSON key), in insertion order
 */
Map<String, Map<String, String>> getTableNames() {
    Map<String, String> revenue = new LinkedHashMap<>();
    revenue.put("Total Revenue", "totalRevenue");
    revenue.put("Cost of Revenue", "costOfRevenue");
    revenue.put("Gross Profit", "grossProfit");

    Map<String, String> operatingExpenses = new LinkedHashMap<>();
    operatingExpenses.put("Research Development", "researchDevelopment");
    operatingExpenses.put("Selling General and Administrative", "sellingGeneralAdministrative");
    operatingExpenses.put("Non Recurring", "nonRecurring");
    operatingExpenses.put("Others", "otherOperatingExpenses");
    operatingExpenses.put("Total Operating Expenses", "totalOperatingExpenses");
    operatingExpenses.put("Operating Income or Loss", "operatingIncome");

    Map<String, Map<String, String>> allTableNames = new LinkedHashMap<>();
    allTableNames.put("Revenue", revenue);
    allTableNames.put("Operating Expenses", operatingExpenses);
    return allTableNames;
}

Nous pouvons utiliser cette table de correspondance (Map) pour sélectionner une seule cellule, par exemple le « revenu total » du 30.06.2018 (qui se trouve dans la première ligne et la première colonne) :

// Parse the JSON text held in jsData and fetch the table array from it.
JSONObject jo = new JSONObject(jsData);
JSONArray table = getTable(jo);

// Look up the JSON key registered for the "Total Revenue" row of the "Revenue" sub-table.
Map<String, Map<String, String>> tableNames = getTableNames();
String totalRevenueKey = tableNames.get("Revenue").get("Total Revenue");
// Read the whole row, then keep only its first column.
String[] totalRevenueValues = getRow(table, totalRevenueKey);
String value = totalRevenueValues[0];

ou nous pourrions itérer sur les noms de table et construire une liste, ou une chaîne, contenant toutes les données de la table :

// Flattened table contents: for each sub-table, its name and the dates,
// then for each row its label followed by its values.
List<String> tableData = new ArrayList<>();
Map<String, Map<String, String>> tableNames = getTableNames();
String[] dates = getDates(table);

for (Map.Entry<String, Map<String, String>> section : tableNames.entrySet()) {
    // Header: sub-table name followed by the column dates.
    tableData.add(section.getKey());
    tableData.addAll(Arrays.asList(dates));

    Map<String, String> rows = section.getValue();
    for (Map.Entry<String, String> labelAndKey : rows.entrySet()) {
        // Read the values first, then append the row label and its cells.
        String[] cells = getRow(table, labelAndKey.getValue());
        tableData.add(labelAndKey.getKey());
        tableData.addAll(Arrays.asList(cells));
    }
}
// Single comma-separated string of everything collected above.
String tableDataString = TextUtils.join(", ", tableData);

J'ai essayé de reproduire la table HTML autant que possible, de sorte que la liste tableData et la chaîne résultante sont formatées comme « nom de la table, date, date, date, date » et « nom de la ligne, prix, prix, prix, prix », mais il peut être préférable de n'inclure que les chiffres (dans ce cas, il convient de n'ajouter à tableData que les éléments de tableRow).

Je suppose que tu aimes

Origine http://43.154.161.224:23101/article/api/json?id=136853&siteId=1
conseillé
Classement