Skip to content

Scrapen von Anhängen schlägt fehl in Witten #21

@the-infinity

Description

@the-infinity

Dort wird eine leicht andere Version der ASP-Linie eingesetzt, welche u. a. auch eine andere Download-URL hat. Das Problem tritt bei allen PDFs auf.

Log sieht mit einigen Zusatzausgaben (u.a. das Resultat) so aus:

Getting submission 3192 from https://secure.stadt-witten.de/session/bis/vo0050.asp?__kvonr=3192
Getting attachment '0729_V_15_Vorlage'
<POST https://secure.stadt-witten.de/session/bis/ydocstart.asp application/x-www-form-urlencoded
  <HiddenControl(DO=812518775748617743470535354582284735550864428772745503172237060132754656) (readonly)>
  <HiddenControl(DT=0729_V_15_Vorlage) (readonly)>
  <HiddenControl(CLIENT=6) (readonly)>
  <HiddenControl(DEN=pdf) (readonly)>
  <HiddenControl(SF=) (readonly)>
  <HiddenControl(LO=) (readonly)>
  <HiddenControl(s1=775882716) (readonly)>
  <HiddenControl(s2=815257364) (readonly)>
  <HiddenControl(s3=587552575) (readonly)>
  <HiddenControl(s4=173547763) (readonly)>
  <HiddenControl(type=do) (readonly)>
  <ImageControl(_submit=)>>

<html>
<head>
<title>Fehler bei der Dokumentanzeige</title>
<link rel="stylesheet" href="css/styles.css">
<link rel="stylesheet" href="css/_styles.css">
</head>
<body class="smc_body"><div class="smccontentframe">

<div id="smccontentdata" class="smccontentdata">
<h3></h3>Falscher Seitenaufruf
</div>

</div></body></html>

WARNING: No entry in config.FILE_EXTENSIONS for 'text/html'

Verwendete Config:

# encoding: utf-8

# NOTE(review): presumably the official regional key ("Regionalschluessel")
# of the municipality -- confirm against the scraper code.
RS = "059540036036"

# City name; used for the log file.
CITY = "witten"

# Storage backend. "mongodb" is the only supported value at the moment.
DB_TYPE = "mongodb"

# MongoDB database name.
DB_NAME = "scrapearis"

# MongoDB host; "localhost" when the server runs on this machine.
DB_HOST = "localhost"

# MongoDB port (27017 is the MongoDB default).
DB_PORT = 27017

# Base URL of the SessionNet installation; should end with a slash.
BASE_URL = "https://secure.stadt-witten.de/session/bis/"

# Name the crawler uses to identify itself to the server.
USER_AGENT_NAME = "scrape-a-ris/0.1"

# Pause between two requests, in seconds. Raise this value if the
# remote system behaves unstably.
WAIT_TIME = 0.2

# Log level: DEBUG, INFO, WARNING, ERROR or CRITICAL.
LOG_LEVEL = "INFO"

# Where to log to (a directory path, going by the name and trailing slash).
LOG_BASE_DIR = "/var/log/ris-scraper/"

##### Page URL masks
#
# One set of masks per SessionNet flavour ('ASP' / 'PHP'). As written below,
# the *_PARSE_PATTERN values are relative and use named "{name:d}" fields,
# while the *_PRINT_PATTERN values are prefixed with BASE_URL and use "%d"
# placeholders.

URLS = {}

# Masks for installations serving *.asp pages.
URLS['ASP'] = {
    # Month calendar page
    'CALENDAR_MONTH_PARSE_PATTERN': 'si0040.asp?__cjahr={year:d}&__cmonat={month:d}',
    'CALENDAR_MONTH_PRINT_PATTERN': BASE_URL + 'si0040.asp?__cjahr=%d&__cmonat=%d',

    # Session detail page
    'SESSION_DETAIL_PARSE_PATTERN': 'to0040.asp?__ksinr={session_id:d}',
    'SESSION_DETAIL_PRINT_PATTERN': BASE_URL + 'to0040.asp?__ksinr=%d',

    # Committee detail page
    'COMMITTEE_DETAIL_PARSE_PATTERN': 'kp0040.asp?__kgrnr={committee_id:d}',
    'COMMITTEE_DETAIL_PRINT_PATTERN': BASE_URL + 'kp0040.asp?__kgrnr=%d',

    # Submission detail page
    'SUBMISSION_DETAIL_PARSE_PATTERN': 'vo0050.asp?__kvonr={submission_id:d}',
    'SUBMISSION_DETAIL_PRINT_PATTERN': BASE_URL + 'vo0050.asp?__kvonr=%d',

    # File name(s) of the attachment download target
    'ATTACHMENT_DOWNLOAD_TARGET': ['ydocstart.asp', 'getfile.asp'],
}

# Masks for installations serving *.php pages.
URLS['PHP'] = {
    # Month calendar page
    'CALENDAR_MONTH_PARSE_PATTERN': 'si0040.php?__cjahr={year:d}&__cmonat={month:d}',
    'CALENDAR_MONTH_PRINT_PATTERN': BASE_URL + 'si0040.php?__cjahr=%d&__cmonat=%d',

    # Session detail page
    'SESSION_DETAIL_PARSE_PATTERN': 'to0040.php?__ksinr={session_id:d}',
    'SESSION_DETAIL_PRINT_PATTERN': BASE_URL + 'to0040.php?__ksinr=%d',

    # Committee detail page
    'COMMITTEE_DETAIL_PARSE_PATTERN': 'kp0040.php?__kgrnr={committee_id:d}',
    'COMMITTEE_DETAIL_PRINT_PATTERN': BASE_URL + 'kp0040.php?__kgrnr=%d',

    # Submission detail page
    'SUBMISSION_DETAIL_PARSE_PATTERN': 'vo0050.php?__kvonr={submission_id:d}',
    'SUBMISSION_DETAIL_PRINT_PATTERN': BASE_URL + 'vo0050.php?__kvonr=%d',

    # File name(s) of the attachment download target
    'ATTACHMENT_DOWNLOAD_TARGET': ['ydocstart.php', 'getfile.php'],
}

##### XPATH strings to find elements within pages
#
# Keyed first by SessionNet flavour ('ASP' / 'PHP'), then by element name.
# Both flavours define the same twelve keys.

XPATH = {}

XPATH['ASP'] = {
    # --- Session detail page ---

    # Session title
    'SESSION_DETAIL_TITLE': '//h1',

    # Table cells carrying the session identifier, committee name and
    # further details
    'SESSION_DETAIL_IDENTIFIER_TD': '//*[@id="smctablevorgang"]/tbody//td',

    # Link to the committee
    # (alternative kept for reference:
    #  '//li[@class="smcmenucontext_fct_gremium"]/a')
    'SESSION_DETAIL_COMMITTEE_LINK': '//a[@class="smccontextmenulink"]',

    # Table rows containing the agenda items
    # (alternative kept for reference:
    #  '//*[@id="smc_page_to0040_contenttable1"]/tbody/tr')
    'SESSION_DETAIL_AGENDA_ROWS': '//*[@class="smccontenttable smc_page_to0040_contenttable"]/tbody/tr',

    # Link to the submission, relative to an agenda item row
    'SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK': 'td/a',

    # Table with the session-related attachment downloads
    'SESSION_DETAIL_ATTACHMENTS': '//*[@id="smccontent"]/table',

    # Distinct class name of the box/table holding those downloads
    'SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME': 'smcdocboxright',

    # --- Submission detail page (Vorlagen-Detailseite), same roles ---

    'SUBMISSION_DETAIL_TITLE': '//h1',

    'SUBMISSION_DETAIL_IDENTIFIER_TD': '//*[@id="smctablevorgang"]/tbody//td',

    # Rows of the "Beratungsfolge" table
    'SUBMISSION_DETAIL_AGENDA_ROWS': '//*[@id="smc_page_vo0050_contenttable1"]/tbody/tr',

    'SUBMISSION_DETAIL_ATTACHMENTS': '//*[@id="smccontent"]/table',

    'SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME': 'smcdocboxright',
}

XPATH['PHP'] = {
    # --- Session detail page ---

    # Session title
    'SESSION_DETAIL_TITLE': '//h1',

    # Table cells carrying the session identifier, committee name and
    # further details
    'SESSION_DETAIL_IDENTIFIER_TD': '//*[@id="smctablevorgang"]/tbody//td',

    # Link to the committee
    # (alternative kept for reference:
    #  '//li[@class="smcmenucontext_fct_gremium"]/a')
    'SESSION_DETAIL_COMMITTEE_LINK': '//a[@class="smccontextmenulink"]',

    # Table rows containing the agenda items
    'SESSION_DETAIL_AGENDA_ROWS': '//*[@class="smccontenttable smc_page_to0040_contenttable"]/tbody/tr',

    # Link to the submission, relative to an agenda item row
    'SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK': './/a',

    # Table with the session-related attachment downloads
    'SESSION_DETAIL_ATTACHMENTS': '//*[@id="smccontent"]//table',

    # Distinct class name of the box/table holding those downloads
    'SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME': 'smcdocbox',

    # --- Submission detail page (Vorlagen-Detailseite), same roles ---

    'SUBMISSION_DETAIL_TITLE': '//h1',
    'SUBMISSION_DETAIL_IDENTIFIER_TD': '//*[@id="smctablevorgang"]/tbody//td',

    # Rows of the "Beratungsfolge" table
    'SUBMISSION_DETAIL_AGENDA_ROWS': '//*[@class="smccontenttable smc_page_vo0050_contenttable"]/tbody/tr',

    'SUBMISSION_DETAIL_ATTACHMENTS': '//*[@id="smccontent"]//table',

    'SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME': 'smcdocbox',
}

# Flat, module-level XPath constants.
#
# The submission detail page (Vorlagen-Detailseite) uses the same title,
# identifier-cell and attachment expressions as the session detail page,
# so each shared value is bound to both names in one statement.

# Page title.
SESSION_DETAIL_TITLE_XPATH = SUBMISSION_DETAIL_TITLE_XPATH = '//h1'

# Table cells with the identifier, committee name and further details.
SESSION_DETAIL_IDENTIFIER_TD_XPATH = SUBMISSION_DETAIL_IDENTIFIER_TD_XPATH = (
    '//*[@id="smctablevorgang"]/tbody//td'
)

# Table with the page-related attachment downloads, and the distinct class
# name of the box/table that contains them.
SESSION_DETAIL_ATTACHMENTS_XPATH = SUBMISSION_DETAIL_ATTACHMENTS_XPATH = (
    '//*[@id="smccontent"]//table'
)
SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME = (
    SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME
) = 'smcdocbox'

# Link to the committee on the session detail page
# (alternative kept for reference:
#  '//li[@class="smcmenucontext_fct_gremium"]/a').
SESSION_DETAIL_COMMITTEE_LINK_XPATH = '//a[@class="smccontextmenulink"]'

# Agenda item rows on the session detail page
# (alternative kept for reference:
#  '//*[@id="smc_page_to0040_contenttable1"]/tbody/tr').
SESSION_DETAIL_AGENDA_ROWS_XPATH = (
    '//*[@class="smccontenttable smc_page_to0040_contenttable"]/tbody/tr'
)

# Link to the submission, relative to an agenda item row.
SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK_XPATH = './/a'

# Rows of the "Beratungsfolge" table on the submission detail page.
SUBMISSION_DETAIL_AGENDA_ROWS_XPATH = (
    '//*[@class="smccontenttable smc_page_vo0050_contenttable"]/tbody/tr'
)


###### Result normalization mapping

# Currently empty: no result strings are normalized.
RESULT_STRINGS = {

}

# Maps the MIME type of a downloaded file to the file extension to use.
#
# NOTE(review): 'text/html' is deliberately not listed. In the log that
# accompanies this config, the 'text/html' response is an error page
# ("Falscher Seitenaufruf"), not a document, so the resulting warning is
# a useful signal rather than a missing entry.
FILE_EXTENSIONS = {
    'application/pdf': 'pdf',
    'image/tiff': 'tif',
    'image/jpeg': 'jpg',
    # Legacy binary PowerPoint format is .ppt (was wrongly mapped to 'pptx').
    'application/vnd.ms-powerpoint': 'ppt',
    # Office Open XML presentation: the MIME type that really is .pptx.
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'application/msword': 'doc',
    'application/zip': 'zip',
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions