indieweb-utils/src/indieweb_utils/posts/discovery.py at 80f47dd6e46314b40dc8be44bbecd03b06060fff · capjamesg/indieweb-utils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
import re
from typing import List, Tuple
from urllib import parse as url_parse

import mf2py
import requests
from bs4 import BeautifulSoup

from ..utils.urls import _is_http_url, canonicalize_url

# This regex identifies permashortlink citations in the form of (example.com slug)
# Permashortlink citations may be used as a link to a post that does not contain a hyperlink
# Checking for a permashortlink citation is a step in the Original Post Discovery algorithm
# More on permashortlink citations: https://indieweb.org/permashortcitation
PERMASHORTLINK_CITATION_BRACKET_MATCHING = r"\((.*?)\)"


class PostDiscoveryError(Exception):
    pass


def _process_candidate_url(candidate_url: str, posse_permalink: str, parsed_post: BeautifulSoup) -> str:
    try:
        request = requests.get(candidate_url, timeout=5)
    except requests.exceptions.RequestException:
        raise PostDiscoveryError("Could not get candidate url")

    parsed_candidate_url = BeautifulSoup(request.text, "lxml")

    all_hyperlinks = parsed_candidate_url.select("a")

    posse_domain = url_parse.urlsplit(posse_permalink).netloc

    for link in all_hyperlinks:
        if "u-syndication" in link.get("class"):
            url_to_check = link.get("href")

            original_post_url = _syndication_check(url_to_check, posse_permalink, candidate_url, posse_domain)

            if original_post_url:
                return original_post_url

    all_syndication_link_headers = parsed_post.select("link[rel='syndication']")

    for header in all_syndication_link_headers:
        if header.get("href") == posse_permalink:
            url_to_check = header.get("href")

            original_post_url = _syndication_check(url_to_check, posse_permalink, candidate_url, posse_domain)

            if original_post_url:
                return original_post_url

    return ""


def _check_for_link_in_post(last_text: BeautifulSoup) -> str:
    last_text = last_text[0].select("p")[-1]

    # if permashortlink citation
    # format = (url.com id)

    permashortlink_citation = re.search(PERMASHORTLINK_CITATION_BRACKET_MATCHING, last_text.text)

    if permashortlink_citation is not None:
        permashortlink = re.search(PERMASHORTLINK_CITATION_BRACKET_MATCHING, last_text.text)

    if permashortlink is not None:
        permashortlink_value = "http://" + permashortlink.group(0) + "/" + permashortlink.group(1)

        candidate_url = permashortlink_value
    else:
        # check for url at end
        split_text = last_text.text.split(" ")

        if _is_http_url(split_text[-1]):
            candidate_url = split_text[-1]
        else:
            candidate_url = ""

    return candidate_url


def discover_original_post(posse_permalink: str) -> str:
    """
    Find the original version of a post per the Original Post Discovery algorithm.

    refs: https://indieweb.org/original-post-discovery#Algorithm

    :param posse_permalink: The permalink of the post.
    :type posse_permalink: str
    :return: The original post permalink.
    :rtype: str
    """
    parsed_post = BeautifulSoup(posse_permalink, "lxml")

    # Get the post h-entry

    post_h_entry = parsed_post.select(".h-entry")

    original_post_url = None

    if not post_h_entry:
        raise PostDiscoveryError("Could not find h-entry")

    post_h_entry = post_h_entry[0]

    # select with u-url and u-uid
    if post_h_entry.select(".u-url .u-uid"):
        original_post_url = post_h_entry.select(".u-url .u-uid")[0].get("href")
        return original_post_url

    canonical_links = parsed_post.select("link[rel='canonical']")

    if canonical_links:
        original_post_url = canonical_links[0].get("href")
        return original_post_url

    # look for text with see original anchor text

    for link in parsed_post.select("a"):
        if link.text.lower() == "see original".lower() and link.get("href"):
            original_post_url = link.get("href")

            return original_post_url

    candidate_url = None

    last_text = post_h_entry.select(".e-content")

    if last_text:
        candidate_url = _check_for_link_in_post(last_text)

    if candidate_url and candidate_url != "":
        post_url = _process_candidate_url(candidate_url, posse_permalink, parsed_post)

        if post_url != "":
            return post_url

    return ""


def _discover_h_card_from_author_page(author_url: str, rel_author: str) -> dict:
    new_h_card = mf2py.parse(url=author_url)

    # get rel me values from parsed object
    if new_h_card.get("rels") and new_h_card.get("rels").get("me"):
        rel_mes = new_h_card["rels"]["me"]
    else:
        rel_mes = []

    final_h_card = [e for e in new_h_card["items"] if e["type"] == "h-card"]

    for card in final_h_card:
        for j in card["items"]:
            if (
                j.get("type")
                and j.get("type") == ["h-card"]
                and j["properties"]["url"] == rel_author
                and j["properties"].get("uid") == j["properties"]["url"]
            ):
                h_card = j
                return h_card

            if j.get("type") and j.get("type") == ["h-card"] and j["properties"].get("url") in rel_mes:
                h_card = j
                return h_card

            if j.get("type") and j.get("type") == ["h-card"] and j["properties"]["url"] == rel_author:
                h_card = j
                return h_card

    return {}


def discover_author(url: str, page_contents: str = "") -> dict:
    """
    Discover the author of a post per the IndieWeb Authorship specification.

    :param url: The URL of the post.
    :type url: str
    :param page_contents: The optional page contents to use.
        Specifying this value prevents a HTTP request being made to the URL.
    :type page_contents: str
    :return: A h-card of the post.
    :rtype: dict

    .. code-block:: python

        import indieweb_utils
        import mf2py

        url = "https://jamesg.blog/2022/01/28/integrated-indieweb-services/"

        parsed_mf2 = mf2py.parse(url=url)

        post_author = indieweb_utils.discover_author(
            h_entry
        )

        print(post_author) # A h-card object representing the post author.
    """
    if page_contents != "":
        full_page = mf2py.parse(doc=page_contents)
    else:
        full_page = mf2py.parse(url=url)

    preliminary_author = None

    h_entry = [e for e in full_page["items"] if e["type"] == ["h-entry"]]

    if h_entry and h_entry[0]["properties"].get("author"):
        preliminary_author = h_entry[0]["properties"]["author"][0]

    h_feed = [e for e in full_page["items"] if e["type"] == ["h-feed"]]

    if h_feed and h_feed[0]["properties"].get("author"):
        preliminary_author = h_entry[0]["properties"]["author"][0]

    author_page_url = None

    if preliminary_author and type(preliminary_author) == str:
        if preliminary_author.startswith("https://"):
            # author is url, further processing needed
            author_page_url = preliminary_author
        else:
            # author is name
            return {
                "type": ["h-card"],
                "properties": {
                    "name": [preliminary_author],
                    "url": [url],
                },
            }

    if preliminary_author and type(preliminary_author) == dict:
        # author is h-card so the value can be returned
        return preliminary_author

    # if rel=author, look for h-card on the rel=author link
    if author_page_url is None and h_entry and h_entry[0].get("rels") and h_entry[0]["rels"].get("author"):
        rel_author = h_entry[0]["rels"]["author"]

        if rel_author:
            author_page_url = rel_author[0]

    # canonicalize author page
    if author_page_url:
        domain = url_parse.urlsplit(url).netloc

        author_url = canonicalize_url(author_page_url, domain)

        h_card = _discover_h_card_from_author_page(author_url, rel_author)

        return h_card

    return {}


def get_post_type(h_entry: dict, custom_properties: List[Tuple[str, str]] = []) -> str:
    """
    Return the type of a h-entry per the Post Type Discovery algorithm.

    :param h_entry: The h-entry whose type to retrieve.
    :type h_entry: dict
    :param custom_properties: The optional custom properties to use for the Post Type Discovery algorithm.
    :type custom_properties: list[tuple[str, str]]
    :return: The type of the h-entry.
    :rtype: str

    Here is an example of the function in action:

    .. code-block:: python

        import indieweb_utils
        import mf2py

        url = "https://jamesg.blog/2022/01/28/integrated-indieweb-services/"

        parsed_mf2 = mf2py.parse(url=url)

        h_entry = [e for e in parsed_mf2["items"] if e["type"] == ["h-entry"]][0]

        post_type = indieweb_utils.get_post_type(
            h_entry
        )

        print(post_type) # article
    """
    post = h_entry.get("properties")

    if post is None:
        return "unknown"

    values_to_check = [
        ("rsvp", "rsvp"),
        ("in-reply-to", "reply"),
        ("repost-of", "repost"),
        ("like-of", "like"),
        ("video", "video"),
        ("photo", "photo"),
        ("summary", "summary"),
    ]

    for prop in custom_properties:
        if len(prop) == 2 and isinstance(prop, tuple) and isinstance(prop[0], str) and isinstance(prop[1], str):
            values_to_check.append(prop)
        else:
            raise Exception("custom_properties must be a list of tuples")

    for item in values_to_check:
        if post.get(item[0]):
            return item[1]

    post_type = "note"

    if post.get("name") is None or post.get("name")[0] == "":
        return post_type

    title = post.get("name")[0].strip().replace("\n", " ").replace("\r", " ")

    content = post.get("content")

    if content and content[0].get("text") and content[0].get("text")[0] != "":
        content = BeautifulSoup(content[0].get("text"), "lxml").get_text()

    if content and content[0].get("html") and content[0].get("html")[0] != "":
        content = BeautifulSoup(content[0].get("html"), "lxml").get_text()

    if not content.startswith(title):
        return "article"

    return "note"


def _syndication_check(url_to_check, posse_permalink, candidate_url, posse_domain):
    if url_to_check == posse_permalink:
        return candidate_url

    if url_to_check and url_parse.urlsplit(url_to_check).netloc == posse_domain:
        try:
            r = requests.get(url_to_check, timeout=10, allow_redirects=True)
        except requests.exceptions.RequestException:
            # handler will prevent exception due to timeout, if one occurs
            pass

        for url_item in r.history:
            if url_item.url == posse_permalink:
                return candidate_url

    return None