diff --git a/client/styles/site.sass b/client/styles/site.sass index 4bb31a7..e160966 100644 --- a/client/styles/site.sass +++ b/client/styles/site.sass @@ -604,3 +604,15 @@ ul.petitions_creators li margin: 0 padding: 0 + +.lobbyreg_info + margin-left: 5px + color: white + padding: 1px 4px + border-radius: 5px + font-size: 80% + background-color: grey + cursor: default + display: inline-block + -webkit-user-select: none + user-select: none diff --git a/offenesparlament/docs/source/scraper/lobby_register_scraper.rst b/offenesparlament/docs/source/scraper/lobby_register_scraper.rst new file mode 100644 index 0000000..d4e8c11 --- /dev/null +++ b/offenesparlament/docs/source/scraper/lobby_register_scraper.rst @@ -0,0 +1,82 @@ +Scraper: Lobby Register +========================== + +General notes about the lobby register +-------------------------------------- + +Since 1st of January 2013 Austria has a mandatory lobby register: +http://www.lobbyreg.justiz.gv.at + + +What kind of information has to be disclosed depends on the lobby register class: + + Lobby register classes: + A Lobbying companies (A1) and their clients and fields of activity (A2, not public) - e.g. "Kovar & Partners GmbH" + B Companies that employ in-house lobbyists - e.g. "Bayer Austria Gesellschaft m.b.H." + C Self-governing bodies - e.g. "Architekten- und Ingenieurkonsulentenkammer für Steiermark und Kärnten" + D Interest groups - e.g. 
"Interessenvertretung Gemeinnütziger Organisationen" + + Depending on the class the data set might include information like: + - Name + - Address + - Commercial register number + - Information on clients (not public) + - Information on spending/costs/revenues + - Lobbyists (name, date of birth) + - Homepage + - … + + +Information on the registration and data acquisition process (German): +http://www.lobbyreg.justiz.gv.at/edikte/ex/edparm3.nsf/h/ir_Leitfaden/$file/Leitfaden.pdf + + +Legal background (German): +https://www.ris.bka.gv.at/Dokumente/BgblAuth/BGBLA_2012_I_64/BGBLA_2012_I_64.pdf + + +Notes on www.lobbyreg.justiz.gv.at +---------------------------------- + +Only lobby register entries of class 'A1' and 'B' are scraped because 'C' and 'D' do not include information on lobbyists. + +For scraping, the overview pages for class 'A1' and class 'B' are used: + - http://www.lobbyreg.justiz.gv.at/edikte/ir/iredi18.nsf/liste!OpenForm&subf=r&RestrictToCategory=A1 + - http://www.lobbyreg.justiz.gv.at/edikte/ir/iredi18.nsf/liste!OpenForm&subf=r&RestrictToCategory=B + +When starting from 'www.lobbyreg.justiz.gv.at' the overview pages are accessible through the menu entry "Liste nach Registerabteil". + +On the overview page the data is provided as a table, where each row corresponds to one lobby register entry and consists of the following columns: + 'Nr': position of the lobby register entry in the selected view + 'Bezeichnung/Firma': name, address, commercial register number + 'Registerzahl': lobby register number + 'Registerabteilung': register class + 'Details': details, lobbyists' names + 'Letzte Änderung': date of last change/update of the lobby register entry + + +Notes on the lobby register number: +The lobby register number is supposed to be a unique id. 
However, scraping showed that the lobby register number LIVR-00303 was assigned twice within the same class to 'Aktienforum - Österreichischer Verband für Aktien-Emittenten und -Investoren, Lothringerstraße 12, 1031 Wien' and 'Österreichischer Apothekerverband, Spitalgasse 31, 1090 Wien'. This was resolved after it was brought to the attention of the Ministry of Justice (BMJ). (see: https://twitter.com/fin/status/676791501121298432) +The scraper assumes uniqueness of the register numbers and would overwrite entries in case of duplication. + + +Scraper structure +----------------- + +The scraper only uses overview pages. +On every scraper run, data that currently isn't present in the official lobby register is deleted. This is facilitated by the last_seen fields in the models. + + +Model structure +--------------- + + The LobbyRegisterEntry model (Representation of an entry in the Austrian lobby register) includes: + register_number: lobby register number + name: name and address of the entity + commercial_register_number: Firmenbuchnummer + register_class: 'A1' or 'B' + last_change: as given on the website + last_seen: timestamp of last model save + + + The LobbyRegisterPerson model (Lobbyist mentioned in the Austrian lobby register) only includes the name of the lobbyist and refers to the corresponding LobbyRegisterEntry. It also includes a last_seen field. 
\ No newline at end of file diff --git a/offenesparlament/offenesparlament/static/css/site.css b/offenesparlament/offenesparlament/static/css/site.css index 1619960..9de747c 100644 --- a/offenesparlament/offenesparlament/static/css/site.css +++ b/offenesparlament/offenesparlament/static/css/site.css @@ -984,4 +984,16 @@ ul.petitions_creators { margin: 0; padding: 0; } +.lobbyreg_info { + margin-left: 5px; + color: white; + padding: 1px 4px; + border-radius: 5px; + font-size: 80%; + background-color: grey; + cursor: default; + display: inline-block; + -webkit-user-select: none; + user-select: none; } + /*# sourceMappingURL=site.css.map */ diff --git a/offenesparlament/offenesparlament/static/scripts/app.js b/offenesparlament/offenesparlament/static/scripts/app.js index 65040e5..ddc884d 100644 --- a/offenesparlament/offenesparlament/static/scripts/app.js +++ b/offenesparlament/offenesparlament/static/scripts/app.js @@ -1547,7 +1547,7 @@ module.exports = { },{"../actions/AnysearchActions.coffee":1,"backbone":23}],23:[function(require,module,exports){ (function (global){ -// Backbone.js 1.2.3 +// Backbone.js 1.3.2 // (c) 2010-2016 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors // Backbone may be freely distributed under the MIT license. @@ -1593,7 +1593,7 @@ module.exports = { var slice = Array.prototype.slice; // Current version of the library. Keep in sync with `package.json`. - Backbone.VERSION = '1.2.3'; + Backbone.VERSION = '1.3.2'; // For Backbone's purposes, jQuery, Zepto, Ender, or My Library (kidding) owns // the `$` variable. @@ -1857,6 +1857,7 @@ module.exports = { Events.once = function(name, callback, context) { // Map the event into a `{event: once}` object. 
var events = eventsApi(onceMap, {}, name, callback, _.bind(this.off, this)); + if (typeof name === 'string' && context == null) callback = void 0; return this.on(events, callback, context); }; @@ -3828,7 +3829,7 @@ module.exports = invariant; var base64 = require('base64-js') var ieee754 = require('ieee754') -var isArray = require('is-array') +var isArray = require('isarray') exports.Buffer = Buffer exports.SlowBuffer = SlowBuffer @@ -5364,7 +5365,7 @@ function blitBuffer (src, dst, offset, length) { } }).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {}) -},{"base64-js":29,"ieee754":30,"is-array":31}],29:[function(require,module,exports){ +},{"base64-js":29,"ieee754":30,"isarray":31}],29:[function(require,module,exports){ var lookup = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; ;(function (exports) { @@ -5577,38 +5578,10 @@ exports.write = function (buffer, value, offset, isLE, mLen, nBytes) { } },{}],31:[function(require,module,exports){ +var toString = {}.toString; -/** - * isArray - */ - -var isArray = Array.isArray; - -/** - * toString - */ - -var str = Object.prototype.toString; - -/** - * Whether or not the given `val` - * is an array. - * - * example: - * - * isArray([]); - * // > true - * isArray(arguments); - * // > false - * isArray(''); - * // > false - * - * @param {mixed} val - * @return {bool} - */ - -module.exports = isArray || function (val) { - return !! val && '[object Array]' == str.call(val); +module.exports = Array.isArray || function (arr) { + return toString.call(arr) == '[object Array]'; }; },{}],32:[function(require,module,exports){ @@ -19141,7 +19114,7 @@ var nextFrame = typeof window !== 'undefined' ? 
(function () { return window.requestAnimationFrame || window.webkitRequestAnimationFrame || window.mozRequestAnimationFrame || function (callback) { window.setTimeout(callback, 1000 / 60); }; -})() : undefined; // If window is undefined, then we can't define a nextFrame function +})().bind(window) : undefined; // If window is undefined, then we can't define a nextFrame function var AutosizeInput = React.createClass({ displayName: 'AutosizeInput', diff --git a/offenesparlament/offenesparlament/templates/gesetz_detail.html b/offenesparlament/offenesparlament/templates/gesetz_detail.html index 530ecc3..cf8feb3 100644 --- a/offenesparlament/offenesparlament/templates/gesetz_detail.html +++ b/offenesparlament/offenesparlament/templates/gesetz_detail.html @@ -145,12 +145,14 @@

Es gab im vorparlamentarischen Prozess insgesamt {{ {{ opinion.date }} {{ opinion.parl_id }} - {{ opinion.entity }} + {{ opinion.entity }} {% if opinion.entity.matching_lobbyreg_entry %}!{% endif %} - {% for document in opinion.documents.all %} -   - {% endfor %} - Parlamentsseite + {% spaceless %} + {% for document in opinion.documents.all %} +   + {% endfor %} + Parlamentsseite + {% endspaceless %} {% endfor %} diff --git a/offenesparlament/op_scraper/management/commands/match_lobbyreg.py b/offenesparlament/op_scraper/management/commands/match_lobbyreg.py new file mode 100644 index 0000000..9c048d1 --- /dev/null +++ b/offenesparlament/op_scraper/management/commands/match_lobbyreg.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +from django.core.management.base import BaseCommand +from op_scraper.models import Entity, LobbyRegisterPerson + + +class Command(BaseCommand): + def run_from_argv(self, argv): + self._argv = argv + self.execute() + + def handle(self, *args, **options): + Entity.try_matching_lobbyreg_entries() + print 'matched %s Entities' % (Entity.objects.filter(matching_lobbyreg_entry__isnull=False).count(),) diff --git a/offenesparlament/op_scraper/models.py b/offenesparlament/op_scraper/models.py index 584943a..e1b43b1 100644 --- a/offenesparlament/op_scraper/models.py +++ b/offenesparlament/op_scraper/models.py @@ -116,12 +116,25 @@ class Entity(models.Model): email = models.EmailField(null=True, blank=True) phone = PhoneNumberField(null=True, blank=True) + matching_lobbyreg_entry = models.ForeignKey('LobbyRegisterPerson', null=True, blank=True, on_delete=models.SET_NULL) + class Meta: unique_together = ("title", "title_detail") def __unicode__(self): return self.title + @classmethod + def try_matching_lobbyreg_entries(cls): + """ try to match entity names with persons mentioned in the lobby register. 
matching method: case-insensitive contains + """ + + for x in LobbyRegisterPerson.objects.all(): + for y in cls.objects.filter(title__icontains=x.name): + y.matching_lobbyreg_entry = x + y.save() + + class Document(models.Model): @@ -983,6 +996,38 @@ def __unicode__(self): return u'Unterschrift von {} ({}-{}) am {} für {}'\ .format(self.full_name, self.postal_code, self.location, self.date, self.petition) + +class LobbyRegisterEntry(models.Model): + """ + Representation of an entry in the Austrian lobby register + http://www.lobbyreg.justiz.gv.at/ + """ + register_number = models.CharField(max_length=20) + commercial_register_number = models.CharField(max_length=20) + name = models.TextField() + register_class = models.CharField(max_length=2) + last_change = models.CharField(max_length=20) + + last_seen = models.DateTimeField(auto_now=True) + + def __unicode__(self): + return u'%s' % self.name + + +class LobbyRegisterPerson(models.Model): + """ + Lobbyist mentioned in the Austrian lobby register + """ + name = models.TextField() + + last_seen = models.DateTimeField(auto_now=True) + + # Relationships + entry = models.ForeignKey(LobbyRegisterEntry) + + def __unicode__(self): + return u'%s (%s)' % (self.name, self.entry.name,) + class Debate(models.Model): diff --git a/offenesparlament/op_scraper/scraper/parlament/resources/extractors/lobbyregister.py b/offenesparlament/op_scraper/scraper/parlament/resources/extractors/lobbyregister.py new file mode 100644 index 0000000..7a38b93 --- /dev/null +++ b/offenesparlament/op_scraper/scraper/parlament/resources/extractors/lobbyregister.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +from scrapy import Selector +from parlament.resources.extractors import SingleExtractor + +class LOBBIES(SingleExtractor): + + XPATH = '//*[@id="ergebnisliste"]/table/tbody/tr' + + @classmethod + def xt(cls, response): + lobbies = [] + + rows = [Selector(text=row.extract()) + for row in response.xpath(cls.XPATH)] + + for row in rows: + lobby = {} + 
lobby_name=row.xpath('//td[2]/text()').extract()[0] + if lobby_name.endswith(')') and lobby_name[-9:-8] == '(': + lobby['name']=lobby_name[0:(-9)] + lobby['commercial_register_number']=lobby_name[-8:-1] + else: + lobby['name']=lobby_name + lobby['commercial_register_number'] = '' + lobby['register_number']=row.xpath('//td[3]/a/text()').extract()[0] + lobby['register_class']=row.xpath('//td[4]/text()').extract()[0] + lobby['lobbyists']=row.xpath('//td[5]/text()').extract() + lobby['last_change']=row.xpath('//td[6]/text()').extract()[0] + lobbies.append(lobby) + + return lobbies \ No newline at end of file diff --git a/offenesparlament/op_scraper/scraper/parlament/spiders/lobbyregister.py b/offenesparlament/op_scraper/scraper/parlament/spiders/lobbyregister.py new file mode 100644 index 0000000..eeb995e --- /dev/null +++ b/offenesparlament/op_scraper/scraper/parlament/spiders/lobbyregister.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +from op_scraper.models import LobbyRegisterEntry, LobbyRegisterPerson +from parlament.resources.extractors.lobbyregister import * + + +class LobbyRegisterSpider(scrapy.Spider): + name = 'lobbyregister' + titel = 'Lobby Register Spider' + + start_urls = ['http://www.lobbyreg.justiz.gv.at/edikte/ir/iredi18.nsf/liste!OpenForm&subf=r&RestrictToCategory=A1', 'http://www.lobbyreg.justiz.gv.at/edikte/ir/iredi18.nsf/liste!OpenForm&subf=r&RestrictToCategory=B'] + + def parse(self, response): + + lobbies = LOBBIES.xt(response) + + oldest_date = None + + for lobby in lobbies: + + lobbyists = lobby.pop('lobbyists') + + entry, created = LobbyRegisterEntry.objects.update_or_create( + register_number=lobby['register_number'], + defaults=lobby) + + if not oldest_date: + oldest_date = entry.last_seen + + for name in lobbyists: + # update_or_create (not get_or_create): save() must run so auto_now refreshes last_seen, otherwise the cleanup below deletes every lobbyist that already existed + LobbyRegisterPerson.objects.update_or_create( + entry=entry, + name=name) + + # NOTE(review): this cleanup is not restricted to the register class of the current response - confirm it cannot remove rows that belong to the other start URL + LobbyRegisterEntry.objects.filter(last_seen__lt = oldest_date).delete() + LobbyRegisterPerson.objects.filter(last_seen__lt = oldest_date).delete()