-
Notifications
You must be signed in to change notification settings - Fork 8
Open
Description
Dort wird eine leicht andere Version der ASP-Linie eingesetzt, welche u. a. auch eine andere Downloader-URL hat. Das Problem tritt bei allen PDFs auf.
Das Log sieht mit einigen Zusatzausgaben (u. a. dem Resultat) so aus:
Getting submission 3192 from https://secure.stadt-witten.de/session/bis/vo0050.asp?__kvonr=3192
Getting attachment '0729_V_15_Vorlage'
<POST https://secure.stadt-witten.de/session/bis/ydocstart.asp application/x-www-form-urlencoded
<HiddenControl(DO=812518775748617743470535354582284735550864428772745503172237060132754656) (readonly)>
<HiddenControl(DT=0729_V_15_Vorlage) (readonly)>
<HiddenControl(CLIENT=6) (readonly)>
<HiddenControl(DEN=pdf) (readonly)>
<HiddenControl(SF=) (readonly)>
<HiddenControl(LO=) (readonly)>
<HiddenControl(s1=775882716) (readonly)>
<HiddenControl(s2=815257364) (readonly)>
<HiddenControl(s3=587552575) (readonly)>
<HiddenControl(s4=173547763) (readonly)>
<HiddenControl(type=do) (readonly)>
<ImageControl(_submit=)>>
<html>
<head>
<title>Fehler bei der Dokumentanzeige</title>
<link rel="stylesheet" href="css/styles.css">
<link rel="stylesheet" href="css/_styles.css">
</head>
<body class="smc_body"><div class="smccontentframe">
<div id="smccontentdata" class="smccontentdata">
<h3></h3>Falscher Seitenaufruf
</div>
</div></body></html>
WARNING: No entry in config.FILE_EXTENSIONS for 'text/html'
Verwendete Config:
# encoding: utf-8

# NOTE(review): presumably the official regional key (Regionalschluessel)
# of the municipality -- confirm against the code that reads RS.
RS = "059540036036"

# City name used for the log file
CITY = "witten"

# Database backend; "mongodb" is the only supported value at the moment
DB_TYPE = "mongodb"
# MongoDB database name
DB_NAME = "scrapearis"
# MongoDB host; use "localhost" when MongoDB runs on the same machine
DB_HOST = "localhost"
# MongoDB port (the MongoDB default is 27017)
DB_PORT = 27017

# Base URL of the SessionNet installation; should end with a slash
BASE_URL = "https://secure.stadt-witten.de/session/bis/"

# Name under which the crawler identifies itself to the server
USER_AGENT_NAME = "scrape-a-ris/0.1"

# Pause between two requests, in seconds. Increase this value if the
# system behaves unstably.
WAIT_TIME = 0.2

# Log level: DEBUG, INFO, WARNING, ERROR or CRITICAL
LOG_LEVEL = "INFO"

# Logging target (the value is a directory path with a trailing slash)
LOG_BASE_DIR = "/var/log/ris-scraper/"
##### Page URL masks
# One set of masks per server flavour ('ASP' and 'PHP').
# *_PARSE_PATTERN values are relative masks with named, typed fields;
# *_PRINT_PATTERN values are BASE_URL-prefixed masks with %d placeholders.
URLS = dict(
    ASP=dict(
        # Monthly calendar page
        CALENDAR_MONTH_PARSE_PATTERN='si0040.asp?__cjahr={year:d}&__cmonat={month:d}',
        CALENDAR_MONTH_PRINT_PATTERN=BASE_URL + 'si0040.asp?__cjahr=%d&__cmonat=%d',
        # Session detail page
        SESSION_DETAIL_PARSE_PATTERN='to0040.asp?__ksinr={session_id:d}',
        SESSION_DETAIL_PRINT_PATTERN=BASE_URL + 'to0040.asp?__ksinr=%d',
        # Committee detail page
        COMMITTEE_DETAIL_PARSE_PATTERN='kp0040.asp?__kgrnr={committee_id:d}',
        COMMITTEE_DETAIL_PRINT_PATTERN=BASE_URL + 'kp0040.asp?__kgrnr=%d',
        # Submission detail page
        SUBMISSION_DETAIL_PARSE_PATTERN='vo0050.asp?__kvonr={submission_id:d}',
        SUBMISSION_DETAIL_PRINT_PATTERN=BASE_URL + 'vo0050.asp?__kvonr=%d',
        # Target file name(s) of the attachment download form
        ATTACHMENT_DOWNLOAD_TARGET=['ydocstart.asp', 'getfile.asp']
    ),
    PHP=dict(
        # Monthly calendar page
        CALENDAR_MONTH_PARSE_PATTERN='si0040.php?__cjahr={year:d}&__cmonat={month:d}',
        CALENDAR_MONTH_PRINT_PATTERN=BASE_URL + 'si0040.php?__cjahr=%d&__cmonat=%d',
        # Session detail page
        SESSION_DETAIL_PARSE_PATTERN='to0040.php?__ksinr={session_id:d}',
        SESSION_DETAIL_PRINT_PATTERN=BASE_URL + 'to0040.php?__ksinr=%d',
        # Committee detail page
        COMMITTEE_DETAIL_PARSE_PATTERN='kp0040.php?__kgrnr={committee_id:d}',
        COMMITTEE_DETAIL_PRINT_PATTERN=BASE_URL + 'kp0040.php?__kgrnr=%d',
        # Submission detail page
        SUBMISSION_DETAIL_PARSE_PATTERN='vo0050.php?__kvonr={submission_id:d}',
        SUBMISSION_DETAIL_PRINT_PATTERN=BASE_URL + 'vo0050.php?__kvonr=%d',
        # Target file name(s) of the attachment download form
        ATTACHMENT_DOWNLOAD_TARGET=['ydocstart.php', 'getfile.php']
    )
)
##### XPATH strings to find elements within pages
# One set of expressions per server flavour ('ASP' and 'PHP').
XPATH = dict(
    ASP=dict(
        # Session title on the session detail page
        SESSION_DETAIL_TITLE='//h1',
        # Table cells holding the session identifier, committee name and
        # further details
        SESSION_DETAIL_IDENTIFIER_TD='//*[@id="smctablevorgang"]/tbody//td',
        # Link to the committee on the session detail page
        # (alternative: '//li[@class="smcmenucontext_fct_gremium"]/a')
        SESSION_DETAIL_COMMITTEE_LINK='//a[@class="smccontextmenulink"]',
        # Table rows containing the agenda items on the session detail page
        # (alternative: '//*[@id="smc_page_to0040_contenttable1"]/tbody/tr')
        SESSION_DETAIL_AGENDA_ROWS='//*[@class="smccontenttable smc_page_to0040_contenttable"]/tbody/tr',
        # Link to the submission inside an agenda item row
        SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK='td/a',
        # Table with the session-related attachment downloads
        SESSION_DETAIL_ATTACHMENTS='//*[@id="smccontent"]/table',
        # Distinct class of the box/table holding those attachment downloads
        SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME='smcdocboxright',
        # Same as above, for the submission detail page (Vorlagen-Detailseite)
        SUBMISSION_DETAIL_TITLE='//h1',
        SUBMISSION_DETAIL_IDENTIFIER_TD='//*[@id="smctablevorgang"]/tbody//td',
        # Rows of the "Beratungsfolge" table
        SUBMISSION_DETAIL_AGENDA_ROWS='//*[@id="smc_page_vo0050_contenttable1"]/tbody/tr',
        SUBMISSION_DETAIL_ATTACHMENTS='//*[@id="smccontent"]/table',
        SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME='smcdocboxright'
    ),
    PHP=dict(
        # Session title on the session detail page
        SESSION_DETAIL_TITLE='//h1',
        # Table cells holding the session identifier, committee name and
        # further details
        SESSION_DETAIL_IDENTIFIER_TD='//*[@id="smctablevorgang"]/tbody//td',
        # Link to the committee on the session detail page
        # (alternative: '//li[@class="smcmenucontext_fct_gremium"]/a')
        SESSION_DETAIL_COMMITTEE_LINK='//a[@class="smccontextmenulink"]',
        # Table rows containing the agenda items on the session detail page
        SESSION_DETAIL_AGENDA_ROWS='//*[@class="smccontenttable smc_page_to0040_contenttable"]/tbody/tr',
        # Link to the submission inside an agenda item row
        SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK='.//a',
        # Table with the session-related attachment downloads
        SESSION_DETAIL_ATTACHMENTS='//*[@id="smccontent"]//table',
        # Distinct class of the box/table holding those attachment downloads
        SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME='smcdocbox',
        # Same as above, for the submission detail page (Vorlagen-Detailseite)
        SUBMISSION_DETAIL_TITLE='//h1',
        SUBMISSION_DETAIL_IDENTIFIER_TD='//*[@id="smctablevorgang"]/tbody//td',
        # Rows of the "Beratungsfolge" table
        SUBMISSION_DETAIL_AGENDA_ROWS='//*[@class="smccontenttable smc_page_vo0050_contenttable"]/tbody/tr',
        SUBMISSION_DETAIL_ATTACHMENTS='//*[@id="smccontent"]//table',
        SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME='smcdocbox'
    )
)
# Flat XPath settings. Wherever the submission detail page
# (Vorlagen-Detailseite) uses the same expression as the session detail
# page, both names are bound to that value in a single chained assignment.

# Page title
SESSION_DETAIL_TITLE_XPATH = SUBMISSION_DETAIL_TITLE_XPATH = '//h1'

# Table cells holding the identifier, committee name and further details
SESSION_DETAIL_IDENTIFIER_TD_XPATH = SUBMISSION_DETAIL_IDENTIFIER_TD_XPATH = \
    '//*[@id="smctablevorgang"]/tbody//td'

# Link to the committee on the session detail page
# (alternative: '//li[@class="smcmenucontext_fct_gremium"]/a')
SESSION_DETAIL_COMMITTEE_LINK_XPATH = '//a[@class="smccontextmenulink"]'

# Table rows containing the agenda items on the session detail page
# (alternative: '//*[@id="smc_page_to0040_contenttable1"]/tbody/tr')
SESSION_DETAIL_AGENDA_ROWS_XPATH = \
    '//*[@class="smccontenttable smc_page_to0040_contenttable"]/tbody/tr'

# Link to the submission inside an agenda item row
SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK_XPATH = './/a'

# Table with the attachment downloads, and the class of its container
SESSION_DETAIL_ATTACHMENTS_XPATH = SUBMISSION_DETAIL_ATTACHMENTS_XPATH = \
    '//*[@id="smccontent"]//table'
SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME = \
    SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME = 'smcdocbox'

# Rows of the "Beratungsfolge" table on the submission detail page
SUBMISSION_DETAIL_AGENDA_ROWS_XPATH = \
    '//*[@class="smccontenttable smc_page_vo0050_contenttable"]/tbody/tr'
###### Result normalization mapping
# No normalization entries are configured.
RESULT_STRINGS = dict()
# Maps a MIME type to the file extension (without leading dot) used for it.
# NOTE(review): 'text/html' is deliberately not listed. In the log quoted
# above, the 'text/html' response to the attachment request was an error page
# ("Falscher Seitenaufruf"), not a document; mapping it to 'html' would only
# silence the warning.
FILE_EXTENSIONS = {
    'application/pdf': 'pdf',
    'image/tiff': 'tif',
    'image/jpeg': 'jpg',
    # Legacy binary PowerPoint format: the matching extension is 'ppt'
    # (previously mapped to 'pptx', which is the OOXML format listed below).
    'application/vnd.ms-powerpoint': 'ppt',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'application/msword': 'doc',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/zip': 'zip'
}
Metadata
Metadata
Assignees
Labels
No labels