最新消息:20210917 已从crifan.com换到crifan.org

【已解决】Python上传印象笔记的笔记的图片资源到WordPress后且更新印象笔记的笔记

图片 crifan 371浏览 0评论
折腾:
【未解决】Python处理发布印象笔记帖子到WordPress后的部分细节优化
期间,之前已实现印象笔记Evernote的note笔记的图片资源resource,缩小处理后,上传到WordPress的media:
【已解决】Python中更新印象笔记中帖子中附件图片的数据
后,回来更新Evernote的note的resources的list
但是前后调试了几次,发现都是有些bug:
更新后,note的resources和实际已处理的image的个数不匹配
导致更新后,resources中还残留没处理的图片资源,但是content的html中已经没了en-media了。
所以,此处再去继续修复逻辑。
最后采用的是:
给noteDetail.resources拷贝了一份latestResList,然后作为参数
    # Upload each resource of the note to WordPress and sync the resulting url back into the note.
    logging.info("Uploading resource and sync url")
    if noteDetail.resources:
        # Work on deep copies instead of noteDetail.resources itself:
        # - originResList is only iterated, so the loop always visits every original resource
        # - latestResList is handed to uploadNoteImageToWordpress, which (per the note at the
        #   end of this loop) removes each processed resource from it
        # Iterating the live list while it is being altered would end the loop too early.
        # NOTE(review): both lists are independent deep copies, so removal from latestResList
        # presumably relies on Resource equality (not identity) - confirm.
        originResList = copy.deepcopy(noteDetail.resources)
        latestResList = copy.deepcopy(noteDetail.resources)
        totalResNum = len(originResList)
        logging.info("Total resources: %d", totalResNum)
        for curResIdx, eachResource in enumerate(originResList):
            # 1-based counter, only used for the progress log line
            curResNum = curResIdx + 1
            logging.info("%s resource %d/%d %s", "-"*20, curResNum, totalResNum, "-"*20)
            uploadedImgUrl = gEnToWp.uploadNoteImageToWordpress(noteDetail, eachResource, latestResList)
            # example: 'https://www.crifan.com/files/pic/uploads/2020/11/acd6a38382b0475db72764187cac7ae0.jpg'
            logging.info("uploaded resource %s to wordpress and has sync url %s", eachResource.guid, uploadedImgUrl)
            # Do NOT remove the resource from latestResList here:
            # if uploadedImgUrl:
            #     latestResList.remove(eachResource)
                # Doing so raised: ValueError list.remove(x): x not in list
                # because uploadNoteImageToWordpress has already removed it internally
传入uploadNoteImageToWordpress
    def uploadNoteImageToWordpress(self, curNoteDetail, curResource, curResList=None):
        """Upload note single imges to wordpress, and sync to note (replace en-media to img) 


        Args:
            curNote (Note): evernote Note
            curResource (Resource): evernote Note Resource
            curResList (list): evernote Note Resource list
        Returns:
            upload image url(str)
        Raises:
        """
        if not curResList:
            curResList = curNoteDetail.resources


        uploadedImgUrl = ""


        isImg = self.evernote.isImageResource(curResource)
        if not isImg:
            logging.warning("Not upload resource %s to wordpress for Not Image", curResource)
            return uploadedImgUrl


        isUploadOk, respInfo = self.uploadImageToWordpress(curResource)
        if isUploadOk:
            # {'id': 70491, 'url': 'https://www.crifan.com/files/pic/uploads/2020/11/c8b16cafe6484131943d80267d390485.jpg', 'slug': 'c8b16cafe6484131943d80267d390485', 'link': 'https://www.crifan.com/c8b16cafe6484131943d80267d390485/', 'title': 'c8b16cafe6484131943d80267d390485'}
            uploadedImgUrl = respInfo["url"]
            logging.info("uploaded url %s", uploadedImgUrl)
            # "https://www.crifan.com/files/pic/uploads/2020/03/f6956c30ef0b475fa2b99c2f49622e35.png"
            # relace en-media to img
            respNote = self.syncNoteImage(curNoteDetail, curResource, uploadedImgUrl, curResList)
            # logging.info("Complete sync image %s to note %s", uploadedImgUrl, respNote.title)
        else:
            logging.warning("Failed to upload image resource %s to wordpress", curResource)


        return uploadedImgUrl
再传入:syncNoteImage
    def syncNoteImage(self, curNoteDetail, curResource, uploadedImgUrl, curResList=None):
        """Sync uploaded image url into Evernote Note content, replace en-media to img


        Args:
            curNoteDetail (Note): evernote Note
            curResource (Resource): evernote Note Resource
            uploadedImgUrl (str): uploaded imge url, previously is Evernote Resource
            curResList (list): evernote Note Resource list
        Returns:
            updated note detail
        Raises:
        """
        if not curResList:
            curResList = curNoteDetail.resources


        # curContent = curNoteDetail.content
        # logging.debug("curContent=%s", curContent)
        # soup = BeautifulSoup(curContent, 'html.parser')
        soup = crifanEvernote.noteContentToSoup(curNoteDetail)


        """
            <en-media hash="7c54d8d29cccfcfe2b48dd9f952b715b" type="image/png" />
        """
        # imgeTypeP = re.compile("image/\w+")
        # mediaNodeList = soup.find_all("en-media", attrs={"type": imgeTypeP})
        # mediaNodeList = soup.find("en-media", attrs={"hash": })
        curEnMediaSoup = crifanEvernote.findResourceSoup(soup, curResource)
        logging.debug("curEnMediaSoup=%s", curEnMediaSoup)
        # curEnMediaSoup=<en-media hash="0bbf1712d4e9afe725dd51e701c7fae6" style="width: 788px; height: auto;" type="image/jpeg"></en-media>


        if curEnMediaSoup:
            curImgSoup = curEnMediaSoup
            curImgSoup.name = "img"
            curImgSoup.attrs = {"src": uploadedImgUrl}
            logging.debug("curImgSoup=%s", curImgSoup)
            # curImgSoup=<img src="https://www.crifan.com/files/pic/uploads/2020/11/c8b16cafe6484131943d80267d390485.jpg"></img>
            # new content string
            updatedContent = crifanEvernote.soupToNoteContent(soup)
            logging.debug("updatedContent=%s", updatedContent)
            curNoteDetail.content = updatedContent
        else:
            logging.warning("Not found en-media node for guid=%s, mime=%s, fileName=%s", curResource.guid, curResource.mime, curResource.attributes.fileName)
            # here even not found, still consider as processed, later will remove it


        # remove resource from resource list
        # oldResList = curNoteDetail.resources
        # Note: avoid side-effect: alter pass in curNoteDetail object's resources list
        # which will cause caller curNoteDetail.resources loop terminated earlier than expected !
        # oldResList = copy.deepcopy(curNoteDetail.resources)
        # oldResList.remove(curResource) # workable
        # newResList = oldResList
        # Note 20201206: has update above loop, so should directly update curNoteDetail.resources
        # curNoteDetail.resources.remove(curResource)
        # newResList = curNoteDetail.resources


        curResList.remove(curResource)
        newResList = curResList


        syncParamDict = {
            # mandatory
            "noteGuid": curNoteDetail.guid,
            "noteTitle": curNoteDetail.title,
            # optional
            "newContent": curNoteDetail.content,
            "newResList": newResList,
        }
        respNote = self.evernote.syncNote(**syncParamDict)
        logging.info("Complete sync image %s to evernote note %s", uploadedImgUrl, curNoteDetail.title)


        return respNote
如此,内部处理时,只依赖于传入的resource的list,并从中remove已处理的resource
这样就既不会影响、也不会受制于当前note原有的resources了
最终,实现了:
确保每次上传了一个图片resource后,把它从当前的resources的list中删除掉
以及再去用最新的resources(调用evernote.syncNote)及时更新note,确保同步。
附录,相关被调用到的函数:
libs/crifan/crifanEvernote.py
    @staticmethod
    def findResourceSoup(soup, curResource):
        """find related BeautifulSoup soup from Evernote Resource


        Args:
            soup (Soup): BeautifulSoup soup
            curResource (Resource): Evernote Resource
        Returns:
            soup node
        Raises:
        """
        curMime = curResource.mime # 'image/png'
        logging.debug("curMime=%s", curMime)
        # # method 1: calc again
        # curResBytes = curResource.data.body
        # curHashStr1 = utils.calcMd5(curResBytes) # 'dc355da030cafe976d816e99a32b6f51'


        # method 2: convert from body hash bytes
        curHashStr = utils.bytesToStr(curResource.data.bodyHash)
        logging.debug("curHashStr=%s", curHashStr)
        # b'\xae\xe1G\xdb\xcdh\x16\xca+@IF"\xff\xfa\xa3' -> 'aee147dbcd6816ca2b40494622fffaa3'


        # imgeTypeP = re.compile("image/\w+")
        curResSoup = soup.find("en-media", attrs={"type": curMime, "hash": curHashStr})
        logging.debug("curResSoup=%s", curResSoup)
        # <en-media hash="aee147dbcd6816ca2b40494622fffaa3" type="image/png" width="370"></en-media>
        return curResSoup
注:当之前调试期间出现不匹配时,此处curResSoup就可能为空
而正常情况下,肯定不为空的,是有值的。
libs/crifan/utils.py
def bytesToStr(inputBytes, encoding="UTF-8"):
    r"""Return the hexadecimal text representation of binary bytes

    Args:
        inputBytes (bytes): bytes to convert
        encoding (str): encoding used to decode the hexlified bytes into str
    Returns:
        str
    Examples:
        input: b'\xdc5]\xa00\xca\xfe\x97m\x81n\x99\xa3+oQ'
        return: 'dc355da030cafe976d816e99a32b6f51'
    Raises:
    """
    # hexlify gives bytes like b'dc355da0...', decode turns them into 'dc355da0...'
    return binascii.hexlify(inputBytes).decode(encoding)
另外更新到最后的
libs/crifan/crifanEvernote.py
    @staticmethod
    def soupToNoteContent(soup):
        """Convert BeautifulSoup Soup to Evernote Note content


        Args:
            soup (Soup): BeautifulSoup Soup
        Returns:
            Evernote Note content html(str)
        Raises:
        """


        # for debug
        # if soup.name != "html":
        if soup.name != "[document]":
            logging.info("soup.name=%s", soup.name)


        # soup.name = "en-note" # not work
        noteContentHtml = utils.soupToHtml(soup, isFormat=False)
        # Note: here not use formated html, to avoid
        # speical case:
        # some special part title is url, then format will split part url and title
        # so here not use format


        # convert <html>...</html> back to <en-note>...</en-note>
        noteContentHtml = re.sub('<html>(?P<contentBody>.+)</html>', "<en-note>\g<contentBody></en-note>", noteContentHtml, flags=re.S)


        noteContentHtml = crifanEvernote.convertToClosedEnMediaTag(noteContentHtml)


        # add first line
        # <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
        noteContentHtml = '<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">\n' + noteContentHtml


        return noteContentHtml
libs/crifan/utils.py
def soupToHtml(soup, isFormat=True):
    """Dump a soup into its html string

    Args:
        soup (Soup): BeautifulSoup soup
        isFormat (bool): True to use soup.prettify() for formatted html, otherwise plain str(soup)
    Returns:
        html (str)
    Raises:
    """
    return soup.prettify() if isFormat else str(soup)
libs/crifan/crifanEvernote.py
    @staticmethod
    def convertToClosedEnMediaTag(noteHtml):
        """Process note content html, for special </en-media> will cause error, so need convert:
                <en-media hash="7c54d8d29cccfcfe2b48dd9f952b715b" type="image/png"></en-media>
            to closed en-media tag:
                <en-media hash="7c54d8d29cccfcfe2b48dd9f952b715b" type="image/png" />
        Args:
            noteHtml (str): Note content html
        Returns:
            note content html with closed en-media tag (str)
        Raises:
        """
        noteHtml = re.sub("(?P<enMedia><en-media\s+[^<>]+)>\s*</en-media>", "\g<enMedia> />", noteHtml, flags=re.S)
        return noteHtml
libs/crifan/crifanEvernote.py
    @staticmethod
    def noteContentToSoup(curNote):
        """Convert Evernote Note content to BeautifulSoup Soup


        Args:
            curNote (Note): Evernote Note
        Returns:
            Soup
        Raises:
        """
        noteHtml = crifanEvernote.getNoteContentHtml(curNote)


        soup = utils.htmlToSoup(noteHtml)
        # Note: now top node is <html>, not <en-note>
        #       but top node name is '[document]' not 'html'


        # for debug
        # if soup.name != "html":
        if soup.name != "[document]":
            logging.info("soup.name=%s", soup.name)


        return soup
以及最新的:
    @staticmethod
    def getNoteContentHtml(curNote, retailTopHtml=True):
        """Get evernote Note content html


        Args:
            curNote (Note): evernote Note
            retailTopHtml (bookl): Ture to <html>xxx<html>, False to xxx
        Returns:
            html (str)
        Raises:
        """
        noteHtml = curNote.content


        # Special:
        # '<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">\n<en-note>
        # -> remove: <?xml version="1.0" encoding="UTF-8" standalone="no"?>
        noteHtml = re.sub('<\?xml version="1.0" encoding="UTF-8" standalone="no"\?>\s*', "", noteHtml)


        # remove fisrt line
        # <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
        # '<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">\n
        noteHtml = re.sub('<!DOCTYPE en-note SYSTEM "http://xml\.evernote\.com/pub/enml2\.dtd">\s*', "", noteHtml)


        if retailTopHtml:
            # convert <en-note>...</en-note> to <html>...</html>
            replacedP = "<html>\g<contentBody></html>"
        else:
            # convert <en-note>...</en-note> to ...
            replacedP = "\g<contentBody>"
        noteHtml = re.sub('<en-note>(?P<contentBody>.+)</en-note>', replacedP, noteHtml, flags=re.S)


        noteHtml = noteHtml.strip()


        return noteHtml
上传image图片的resource:
libs/crifan/crifanEvernoteToWordpress.py
    def uploadImageToWordpress(self, imgResource):
        """Upload image resource to wordpress


        Args:
            imgResource (Resouce): evernote image Resouce
        Returns:
            (bool, dict)
        Raises:
        """
        imgData = imgResource.data
        imgBytes = imgData.body
        imgDataSize = imgData.size
        # guid:'f6956c30-ef0b-475f-a2b9-9c2f49622e35'
        imgGuid = imgResource.guid
        logging.debug("imgGuid=%s, imgDataSize=%s", imgGuid, imgDataSize)


        curImg = utils.bytesToImage(imgBytes)
        logging.debug("curImg=%s", curImg)


        # # for debug
        # curImg.show()


        imgFormat = curImg.format # 'PNG'
        imgSuffix = utils.ImageFormatToSuffix[imgFormat] # 'png'
        imgMime = utils.ImageSuffixToMime[imgSuffix] # 'image/png'
        # curDatetimeStr = utils.getCurDatetimeStr() # '20200307_173141'
        processedGuid = imgGuid.replace("-", "") # 'f6956c30ef0b475fa2b99c2f49622e35'
        # imgeFilename = "%s.%s" % (curDatetimeStr, imgSuffix) # '20200307_173141.png'
        imgeFilename = "%s.%s" % (processedGuid, imgSuffix) # 'f6956c30ef0b475fa2b99c2f49622e35.png'


        isUploadImgOk, respInfo = self.wordpress.createMedia(imgMime, imgeFilename, imgBytes)
        logging.info("%s to upload resource %s to wordpress", isUploadImgOk, imgGuid)
        return isUploadImgOk, respInfo
libs/crifan/crifanWordpress.py
class crifanWordpress(object):

    def __init__(self, host, jwtToken, requestsProxies=None):
。。。
        self.apiMedia = self.host + "/wp-json/wp/v2/media" # 'https://www.crifan.com/wp-json/wp/v2/media'

    def createMedia(self, contentType, filename, mediaBytes):
        """Create wordpress media (image)
            by call REST api: POST /wp-json/wp/v2/media


        Args:
            contentType (str): content type
            filename (str): attachment file name
            mediaBytes (bytes): media binary bytes
        Returns:
            (bool, dict)
                True, uploaded media info
                False, error detail
        Raises:
        """
        curHeaders = {
            "Authorization": self.authorization,
            "Content-Type": contentType,
            "Accept": "application/json",
            'Content-Disposition': 'attachment; filename=%s' % filename,
        }
        logging.debug("curHeaders=%s", curHeaders)
        # curHeaders={'Authorization': 'Bearer eyJ0xxxyyy.zzzB4', 'Content-Type': 'image/png', 'Content-Disposition': 'attachment; filename=f6956c30ef0b475fa2b99c2f49622e35.png'}
        createMediaUrl = self.apiMedia
        resp = requests.post(
            createMediaUrl,
            proxies=self.requestsProxies,
            headers=curHeaders,
            data=mediaBytes,
        )
        logging.debug("resp=%s", resp)


        isUploadOk, respInfo = crifanWordpress.processCommonResponse(resp)
        return isUploadOk, respInfo
相关完整的,最新的代码,详见:
【后记】
对于:
更新完毕所有图片后,resources是空列表,也已经更新到note了
但是后续get detail,仍然还有一个resource:
之前以为是自己代码bug呢。
结果再次调试发现,其实就是印象笔记的问题:
即使代码此处更新,resources是空,但是sync后,再去获取,仍然还残留一个image的resource,具体原因,不知道。
对了,再去调试看看,看看具体是哪个图片的resource被残留了。
从guid看看,或许可以找到?又或许是无故多出的?

转载请注明:在路上 » 【已解决】Python上传印象笔记的笔记的图片资源到WordPress后且更新印象笔记的笔记

发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
90 queries in 0.184 seconds, using 23.36MB memory