% by Mirella M. Moro; version: January/18/2012 @ 04:16pm
% -- 01/18/2012: more discussion on SBBD + JIDM; overall revision
% -- 09/03/2010: bib file with names for proceedings and journals; cls with shrinked {received}
% -- 08/27/2010: appendix, table example, more explanation within comments, editors' data

\documentclass[jidm,a4paper]{jidm} % NOTE: JIDM is published on A4 paper
\usepackage{graphicx,url}  % for using figures and url format
\usepackage[T1]{fontenc}   % avoids warnings such as "LaTeX Font Warning: Font shape 'OMS/cmtt/m/n' undefined"

%\usepackage{color}         % temp usage!

%\usepackage{cite} % NOTE: do **not** include this package because it conflicts with jidm.bst

% Standard definitions
\newtheorem{theorem}{Theorem}[section]
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newdef{definition}[theorem]{Definition}
\newdef{remark}[theorem]{Remark}

% New environment definition
\newenvironment{latexcode}
{\ttfamily\vspace{0.1in}\setlength{\parindent}{18pt}}
{\vspace{0.1in}}

% ALL FIELDS UNTIL BEGIN{document} ARE MANDATORY

% The following data (volume, number and page) are given by the editors prior to publishing your article
\jidmVolume{6}
\jidmNumber{1}
\jidmYear{15}
\jidmMonth{June}
\setcounter{page}{1}


% Includes headers with simplified name of the authors and article title
\markboth{Maxwell G. de Oliveira \emph{et al.}}
{Producing Volunteered Geographic Information from Social Media for LBSN improvement}
%  -> \markboth{}{}
%         takes 2 arguments
%         ex: \markboth{M. M. Moro}{Any article title}


% Title of the article
\title{Producing Volunteered Geographic Information from Social Media for LBSN improvement}


% List of authors
%IF THERE ARE TWO or more institutions, please use:
%\author{Name of Author1\inst{1}, Name of Author2\inst{2}, Name of Author3\inst{2}}
\author{Maxwell Guimar\~aes de Oliveira, Cl\'audio de Souza Baptista, Cl\'audio E. C. Campelo, Jos\'e Amilton Moura Acioli Filho and Ana Gabrielle Ramos Falc\~ao}

%Affiliation and email
\institute{Laborat\'orio de Sistemas de Informa\c{c}\~ao, Universidade Federal de Campina Grande (UFCG) - Brazil \\ \email{maxwell@ufcg.edu.br, \{baptista, campelo\}@dsc.ufcg.edu.br, \\ \{joseamilton, anagabrielle\}@copin.ufcg.edu.br}
%\and Universidade Federal Fluminense, Brazil \\ \email{vanessa@ic.uff.br}
% IF THERE IS ANOTHER INSTITUTION:
%\and Name_of_the_second_institution \\
%\email{address@whatever.com}
}


% Article abstract - it should be from 100 to 300 words (past has 119 / now 221)
\begin{abstract}
Volunteered Geographic Information (VGI) emerged from the widespread adoption of devices featuring GPS and Internet connectivity around the world. It has enabled the easier and increased production of spatial data, and a deeper engagement of people with everything involving location. Such a scenario has led to the emergence of Location-Based Social Networks (LBSN), which allow users to be assigned to space-related content. LBSN environments have proved to be quite useful, however, keeping users willing to contribute (i.e., maintaining such environments in continuous operation) has appeared to be challenging. In addressing this issue, we have considered applying Geographic Information Retrieval (GIR) techniques to produce VGI from social media streams on the Web, aiming to improve LBSN with valuable up-to-date content in an automated way. We rely on GIR techniques such as geoparsing the message bodies instead of considering previously geotagged information since we cannot ensure that an embedded geolocation is the same location that the message refers to. An artifact for automatically producing VGI based on social media content is described and validated using a real-world case study. We harvested tweets during the FIFA Confederations Cup and tried to produce valuable VGI from the message stream. Our results proved to be promising for leveraging VGI from social media.
\end{abstract}


% ACM Computing Classification System categories
\category{H.2.8}{Database Applications}{Spatial databases and GIS}
\category{H.3}{Information Storage and Retrieval}{Miscellaneous}
\category{I.7}{Document and Text Processing}{Miscellaneous}

% Categories and Descriptors are available at the 1998 ACM Computing Classification System
% http://www.acm.org/about/class/1998/
%  -> \category{}{}{}
%         takes 3 arguments for the Computing Reviews Classification Scheme.
%         ex: \category{D.3.3}{Programming Languages}{Language Constructs and Features}
%                   [data types and structures]
%                   the last argument, in square brackets, is optional.

% Article keywords
\keywords{Geoparsing, GIR, LBSN, Twitter, VGI}
%  -> \keywords{} (in alphabetical order \keywords{document processing, sequences,
%                      string searching, subsequences, substrings})


% THE ARTICLE BEGINS
\begin{document}
%
% This is optional:
\begin{bottomstuff}
Research supported by the \emph{Conselho Nacional de Desenvolvimento Cientifico e Tecnologico}, CNPq, Brazil.
\end{bottomstuff}
%
\maketitle
%
% ARTICLE NEW SECTION
\section{Introduction}
%
The easier and increased production of spatial data has enabled a deeper engagement of people with everything involving location. It can be mainly explained by the dissemination of devices featuring GPS and the spread of Internet connectivity around the world. Space-related information has been shared in many domains by thousands of users per minute. On the other hand, a large number of users have been increasingly consuming such information by means of location-based applications.

Such phenomenon enabled the emergence of Volunteered Geographic Information (VGI) as an alternative and powerful spatial data source on the Web. VGI consists of data pooled with the geographic context. These spatial data are produced and disseminated by individuals spread throughout the world, forming an environment widely known as Crowdsourcing \cite{surowiecki_wisdom_2005}. These individuals are called volunteers and most of them are neither experts in Geography nor experts in Geographic Information Sciences. Generally, they are ordinary people interested in sharing their viewpoints and knowledge about geographic locations \cite{goodchild_citizens_2007}.

This scenario has led to the emergence of applications such as the Location-Based Social Networks (LBSN). The LBSNs provide context-aware services which allow users to be assigned to content \cite{vicente_location-related_2011}. LBSNs also provide many different types of services, from entertainment to public utilities. In such a social network, much information is voluntarily created, be it textual, multimedia or geographic. The Crowd4City, for instance, is a LBSN that can be applied to the domain of smart cities, with support to participatory human sensors \cite{falcao_crowd4city:_2012}. Crowd4City  aims to create an environment for identification and discussion of matters concerning the government of the cities, a common interest of the population.

Despite initiatives like Crowd4City, one of the main challenges in the use of human sensors has been keeping them willing to contribute and consequently maintaining the LBSNs continuously. Several factors may affect the users' motivation, such as the learning curve for correct operation of a LBSN and the time spent on such activities. Typically, only a few users are in charge of providing a significant volume of information. This issue is visible in terms of geographic location, where many areas around the world are mapped by only few users \cite{haklay_openstreetmap:_2008}. Therefore, it becomes necessary to find alternative methods of keeping the LBSNs up-to-date even when the volume of contributions from volunteers is below the expected volume. In addressing this issue, we have considered applying Geographic Information Retrieval (GIR) techniques to automatically produce VGI for LBSN improvement.

One of the sub-areas of GIR focuses on the development of techniques for identifying geographic locations associated with text documents. Research in this area poses many challenges, including those relating to Natural Language Processing (NLP), handling of uncertainty, disambiguation, and context identification \cite{bordogna_geographic_2012}. Through these GIR techniques, it is possible to process text (from websites, blogs and social networks, for example) and then assign specific geographic locations to it \cite{purves_geographic_2011}. In this sense, previous work has addressed the assignment of geographic locations to Web documents, including social network messages such as tweets from Twitter\footnote{Twitter: \url{http://www.twitter.com}} \cite{rupp_customising_2013,watanabe_jasmine:_2011}.

Our work builds on the hypothesis that texts from the Web, such as messages publicly exchanged in social networks, when processed by a mechanism for identifying the geographic locations they are associated with, could automatically turn into useful information to feed location-based applications such as LBSNs. Hence, we consider that geographic information originated from  social media streams can also be classified as VGI. Thus, social media producers can become non-intentional volunteers in the production of VGI, which could automatically be made available to users of LBSN applications.

In order to validate that hypothesis, this article presents an approach to automatically producing  VGI from social media streams for enriching LBSNs. This approach is based on the application of GIR techniques to text messages from social media. Although public data originated from social media already contain some geotagged information in the form of metadata (such as the geographic position of the user who posted a message), users can freely disseminate information about the most diversified geographic contexts, which frequently mismatch their geographic position at the moment that the information is shared. Thus, by generating VGI from a text message, we can ensure the identified locations actually relate to the places the message content refers to. In our envisioned scenario, the produced VGI should become available for LBSN users, who will be the main consumers and also validators of that information, being capable of pointing to its inconsistencies as well as stressing its relevance, and consequently enriching the crowdsourcing environment.

The main contributions of this article are: the description of a mechanism developed for automatically producing Volunteered Geographic Information, based on the content of social media texts; and a discussion about the task of geoparsing informal texts published on social media and the value that its information may reveal when the geographic context is explored. The remainder of this article is structured as follows. Section 2 describes our proposal. Section 3 addresses a case study carried out to evaluate the proposed ideas. Section 4 discusses related work. Finally, Section 5 summarizes and highlights further work to be undertaken.

\section{An Approach for Automated Production of VGI from Social Media}

This section presents our approach for automated production of VGI based on crowdsourced social media. We propose a systematic approach aiming to automatically produce VGI based on information published on Twitter microtexts. The expected result is the production of spatiotemporal markers with the content of these microtexts, which can be widely viewed and handled by the users of a LBSN. We understand that these spatiotemporal markers conform to VGI policy as they are produced by crowdsourcing data. Thus, social media users assume the role of volunteers in the VGI context.

We believe the automatically produced VGI may help LBSN users interested in learning more about specific geographical locations from people who freely share valuable information on social media. An overview of the proposed approach is presented in Figure \ref{fig:overview}.

\begin{figure}[b] % FIGURES SHOULD BE AT THE TOP [t] OR BOTTOM [b} OF PAGES
\begin{center}	% FIGURES SHOULD BE CENTERED
		\includegraphics[width=1.0\textwidth]{figs/Fig1.png} % YOU MAY SHRINK YOUR FIGURE WITH width
\caption{The main idea of our proposal: turning social media messages into valuable VGI in a LBSN}
    \label{fig:overview}	
       % ALWAYS INCLUDE CAPTIONS IN YOUR FIGURES
	 % USE THE CONTENT WITHIN label FOR REFERENCING IT (USE \ref WITHIN THE TEXT)
\end{center}
\end{figure}

Figure \ref{fig:overview} (left side) illustrates the social networks, such as Twitter and Facebook\footnote{Facebook: \url{http://www.facebook.com}}, as information sources for producing the VGI visualized in a LBSN such as the Crowd4City (right side). In this context, each message posted by the users of these networks can be turned into VGI, which can then be used by LBSN users.

In order to achieve this goal, it is necessary to have a computational process that involves the capture and the treatment of information using GIR techniques, as illustrated in Figure \ref{fig:process}. The computational process of this approach involves, basically, four distinct stages: Crawling, Geoparsing, Georeferencing and VGI Production. The initial stage is Crawling, in which the messages posted on the social networks are captured. We developed an algorithm to capture real-time microtexts (tweets) posted on Twitter. This algorithm focuses on the original text of the messages posted on the network, discarding the other available metadata, except the timestamp that indicates the time the message was published.	

\begin{figure}[tb] % FIGURES SHOULD BE AT THE TOP [t] OR BOTTOM [b} OF PAGES
\vspace{0.5cm}
\begin{center}	% FIGURES SHOULD BE CENTERED
		\includegraphics[width=0.98\textwidth]{figs/Fig2.pdf} % YOU MAY SHRINK YOUR FIGURE WITH width
	\caption{Computational processing flow for automated VGI production}
    \label{fig:process}	
       % ALWAYS INCLUDE CAPTIONS IN YOUR FIGURES
	 % USE THE CONTENT WITHIN label FOR REFERENCING IT (USE \ref WITHIN THE TEXT)
\end{center}
\end{figure}

Once captured in the crawling stage, the microtexts are submitted to the geoparsing stage. In order to accomplish this stage, we used the GeoSEn Geoparser \cite{campelo_model_2009}, which is capable of detecting geographic terms in texts written in Portuguese. At this stage, all candidate locations are identified and then sent to the next stage, in which the text will be georeferenced. Figure \ref{fig:exampleFig1} illustrates the geoparsing and the georeferencing stages applied to a sample microtext.

In the upper half of Figure \ref{fig:exampleFig1}, it is possible to view all candidate locations identified in the sample microtext. The geoparser considers information such as the position of the term in the text and its length, that is, the number of words that form the term. The term position can be used to correlate spatial terms which may appear closely in the messages. In the case where the geoparsing of a microtext returns an empty set of candidate locations, this microtext is discarded and its VGI production process is interrupted.

\begin{figure}[b] % FIGURES SHOULD BE AT THE TOP [t] OR BOTTOM [b} OF PAGES
\begin{center}	% FIGURES SHOULD BE CENTERED
		\includegraphics[width=1.0\textwidth]{figs/Fig3-4.pdf} % YOU MAY SHRINK YOUR FIGURE WITH width
	\caption{Illustration of geoparsing and georeferencing stages applied to a sample microtext}
    \label{fig:exampleFig1}	
       % ALWAYS INCLUDE CAPTIONS IN YOUR FIGURES
	 % USE THE CONTENT WITHIN label FOR REFERENCING IT (USE \ref WITHIN THE TEXT)
\end{center}
\end{figure}

The georeferencing stage is illustrated in the lower half of Figure \ref{fig:exampleFig1}. In this stage, the candidate locations pass through a relevance evaluation in order to define the geographic scope of the microtext. We can notice that only one of the two candidate locations highlighted during the geoparsing stage was considered for the georeferencing of the sample microtext. Since one of these locations (the Dublin City) is inside the other one (Ireland), the geographic scope modeling algorithm returned just the most geographically precise one. For such, we used the GeoScope Modeler featured by the GeoSEn \cite{campelo_2008} in order to define the geographic scope and compute the relevance for its highest hierarchical levels, based on references found in lower levels. The GeoScope Modeler uses the GeoTree, a tree-based data structure which establishes the hierarchical relationship between places stored into the GeoSEn gazetteer. There are six hierarchical levels of places in the GeoTree: Country, Region, State, Mesoregion, Microregion and City. The first level represents regions with less geographic detail while the last represents regions with more geographic details.

Finally, the VGI production stage is responsible for creating the spatiotemporal marker that will be shared on the LBSN. The marker is basically formed by the original microtext captured from the social network, the latitude and longitude coordinates obtained from the georeferencing stage and the timestamp of the moment that the message was first published in the social network. To generate spatial markers, we compute the centroid points of the geometries corresponding to the georeferenced texts. Moreover, these markers are produced automatically and assigned to a particular category defined in the Crowd4City so that they can be easily distinguished from the categories originally managed by the LBSN users, such as education, transportation and security. Thus, this exclusive category emphasizes that the spatial marker was not produced by a LBSN user.

A software application called \emph{text2vgi} was implemented taking into account the whole flow illustrated in Figure \ref{fig:process}. The purpose of such application is to validate the proposed approach, to confirm our hypothesis (discussed in the introductory section), and to identify points which may possibly require further improvements in order to ensure the most spatially-accurate VGI production.

\section{Case Study: Producing VGI from Non-Previously-Geotagged Microtexts}

This section presents a case study using the \emph{text2vgi} software application with microtexts from Twitter.

\subsection{The Dataset}

Our case study was based on a dataset formed by 329,732 microtexts written in Portuguese, published on Twitter during the FIFA's Confederations Cup, which took place in Brazil in 2013. We adopted this dataset because it is related to an event in which people normally write terms that can be associated to geographic location, such as the name of the host cities. The methodology used for conducting this study is illustrated in Figure \ref{fig:processCS}.

\begin{figure}[b] % FIGURES SHOULD BE AT THE TOP [t] OR BOTTOM [b} OF PAGES
\vspace{0.5cm}
\begin{center}	% FIGURES SHOULD BE CENTERED
		\includegraphics[width=0.98\textwidth]{figs/Fig5.pdf} % YOU MAY SHRINK YOUR FIGURE WITH width
	\caption{The process flow for the case study}
    \label{fig:processCS}	
       % ALWAYS INCLUDE CAPTIONS IN YOUR FIGURES
	 % USE THE CONTENT WITHIN label FOR REFERENCING IT (USE \ref WITHIN THE TEXT)
\end{center}
\end{figure}

The Crawler implemented within the \emph{text2vgi} tool is responsible for capturing the messages and storing them in a local database. As the messages are received by the application, the geoparser is activated to identify the candidate locations. Then, the georeferencing module defines the geographic scope of the microtexts that presented at least one candidate location. Finally, the VGI production module concludes the work by creating the spatiotemporal marker.

\subsection{Manual Evaluation}

The whole set of microtexts processed by \emph{text2vgi} had to be evaluated in relation to the identified geographic locations and the spatiotemporal markers produced. By doing this, we could measure the quality of the VGI automatically produced. For such, we recruited some volunteers and trained them to use a web application we developed to help them conduct their analyses. This application displays to the user a random list of processed microtexts, which should be individually analyzed by the volunteer. For each microtext, the user should evaluate the following characteristics: the overall geoparsing accuracy (boolean, checked-stars[1 to 5]); whether the message refers to more than one place (boolean); whether the location assigned to the message by the geoparser could be more precise (boolean).

The question regarding the geoparsing accuracy could have as answer the pair (TRUE, 5-CHECKED-STARS) in the cases in which the georeferencing was totally accurate according to the georeferencing strategy used, or in the cases where the VGI was not produced because the microtext had no references to geographic locations. The answer to this question could also be the pair (FALSE, [ 0 | 1 | 2 | 3 | 4 ]-CHECKED-STARS), depending on the geographic and semantic distances between the location identified by the system (through geoparsing the microtext) and the actual location the microtext refers to (manually identified by the volunteers). An example of geographic distance is the case where a microtext refers to the city of ``Campina Grande'', whilst the system defines the microtext's geographic scope as ``Para\'{i}ba'' (the Brazilian State this city is located in) or ``Nordeste'' (the Region this state is located in). The semantic distance is often related to the system's misinterpretation of a term in the text, such as the case where the microtext's geographic scope is defined as ``Bahia'' (a Brazilian State), while it actually refers to the ``Bahia'' Football Team.

The question about whether a microtext refers to more than one place could have TRUE as answer if the microtext refers to more than one geographic location and, therefore, the system should produce more than one spatiotemporal marker for the same microtext; or FALSE, otherwise. Finally, the question about whether a microtext can be more precise could have TRUE as answer if the microtext presents evidence that its geographic scope could be more precise than the city level, such as neighborhood names, streets, squares, parks or specific buildings, such as stadiums.

From the whole set of processed microtexts, 2.3\% (about 7,500) had at least one geographic location automatically assigned by \emph{text2vgi} and, consequently, could be used to produce spatiotemporal markers. The manual evaluation helped us to understand that this low rate was not due to the inefficiency of our geoparser. On the contrary, it could be observed that the geoparser efficiency was satisfactory, but indeed the majority of the messages actually do not refer to any geographic location.

Considering the huge volume of microtexts of the dataset used in this study, a random sample of these microtexts needed to be defined so that it could be evaluated by volunteers. Such a sample consisted of 35,120 microtexts, with a confidence level of 99\% and a sampling error of 0.65\%. In this sample, 975 microtexts (2.7\%) had a geographic location automatically assigned by \emph{text2vgi}, nearly the same proportion presented by the whole set of processed microtexts. Since the validation was performed by humans, we also considered a margin of error of 2.0\%.

\subsection{Discussion}

The mean time for processing each microtext by \emph{text2vgi} (from the moment of the capture of the message to the production of the spatiotemporal marker) was of 0.25 seconds. It took nearly 23 hours to process the whole dataset using a typical desktop computer, equipped with an Intel Core i7 processor, 8 GB of RAM and 1 single thread.

Considering the sample evaluated by the volunteers, Figure \ref{fig:results} presents the results for true positives, when the geographic location was identified correctly; false positives, when the geographic location was not identified correctly; true negatives, when there was no geographic scope assigned to the text, due to the lack of evidence in it; and false negatives, when no geographic scope was assigned to the text, but there was evidence for it.

\begin{figure}[b] % FIGURES SHOULD BE AT THE TOP [t] OR BOTTOM [b} OF PAGES
\begin{center}	% FIGURES SHOULD BE CENTERED
		\includegraphics[width=1.0\textwidth]{figs/Fig6.pdf} % YOU MAY SHRINK YOUR FIGURE WITH width
	\caption{Pie charts representing the percentages of each result: a) True/False Positives Relation, b) True/False Positives Relation considering the False Positives in five subdivisions, and c) True/False Negatives Relation}
    \label{fig:results}	
       % ALWAYS INCLUDE CAPTIONS IN YOUR FIGURES
	 % USE THE CONTENT WITHIN label FOR REFERENCING IT (USE \ref WITHIN THE TEXT)
\end{center}
\end{figure}

Figure \ref{fig:results}a shows that there was a balance between true and false positives, if we consider as true positives only the 100\% precise location detections. In Figure \ref{fig:results}b, it is possible to see the false positives in five classification levels. Each classification level represents how geographically close the false positive was to a true positive. It can be noticed that the false positives that are very far from the location expressed in the microtext (which received no stars in the accuracy question), represent only 24.6\% - about half the total number of false positives. Finally, in Figure \ref{fig:results}c, it is possible to observe a good result for true negatives. It confirms the lower rate of the processed microtexts which had at least one geographic location automatically assigned by \emph{text2vgi}: in fact there were many microtexts that refer to no locations.

The evaluation performed by the volunteers on the microtexts also revealed the following results:

\begin{itemize}
  \item 16.6\% of the microtexts presented evidence that more detailed geographic locations could have been identified. Thus, a georeferencing strategy which can deal with a broader range of locations may improve the overall accuracy;
  \item 3.2\% of the microtexts presented evidence that more than one geographic location could have been identified, producing therefore more than one spatiotemporal marker.
\end{itemize}

We have used four metrics for evaluating the overall performance of VGI produced automatically by \emph{text2vgi} and validated by volunteers during this case study: Overall Accuracy (74.1\%); Precision (92.3\%); Recall (52.6\%); and the F-Measure (0.67). Among such achieved values, a low recall rate can be noticed, which means that 47.4\% of the microtexts containing genuine references to geographic locations were not correctly interpreted by \emph{text2vgi}.

However, this result was already expected, since the geographic scope considered in our georeferencing strategy considers only locations related to the Brazilian political territorial division. Other relevant geographic references that might appear in the set of microtexts used (such as soccer stadiums and airports) ended up not being properly interpreted. However, it is important to highlight the good precision rate, which is justified by the number of true negatives.

\section{Related Work}

Research on VGI has prevailed in several parts of the world. Besides Computer Science, many correlated disciplines, such as Geography \cite{goodchild_citizens_2007}, Geographic Information Science \cite{jackson_synergistic_2010} and Human Factors \cite{parker_relevance_2011} have investigated issues concerning this kind of volunteered information.

One of the most representative VGI projects is OpenStreetMap (OSM) \cite{haklay_openstreetmap:_2008}. The OSM database consists of a significant collection of volunteered spatial data based on the Wikipedia collaborative model \cite{mooney_annotation_2012}. The OSM project has received many contributions from the community. \citeN{haklay_2010}, for instance, has focused on assessing VGI quality and how VGI can be reliable and usable. \citeN{ballatore_semantically_2011} focused on semantic relationships within OSM data. They highlight how OSM is spatially rich but semantically poor and investigate ways of linking OSM to other distributed repositories.

Besides the OSM project, several works have revealed VGI as a promising research field. \citeN{horita_use_2013} made a thorough literature review on VGI with the objective of verifying its applicability for aiding in disaster management. In that study, it was possible to observe that the VGI has been more frequently used in fires and floods. \citeN{havlik_future_2013} discussed VGI mobile applications concerning several aspects, such as functionalities and user experience. \citeN{ballatore_computing_2013} explored the semantic side of VGI and presented a technique for computing the semantic similarity of geographic terms in VGI based on their lexical definitions and using WordNet. The authors based themselves on the intuition that similar terms tend to be recursively defined by similar terms.

While the research on VGI is still relatively novel, the research on GIR has many studies focused on the identification and indexing of geographic locations through the application of Natural Language Processing (NLP) techniques. \citeN{rupp_customising_2013} discussed the customization of geoparsing and georeferencing tools to be applied in collections of historical texts. The authors made an analogy between the storage/indexing of files about the medieval era and the storage/indexing of Twitter feeds, and discussed questions involving standardization and use of gazetteers. There is no discussion about the spatial precision of the geoparsing, but this could be a motivational factor for such customization.

\citeN{liu_framework_2013} proposed the QGIR, Qualitative Geographic Information Retrieval, as a better option to deal with geographic information described in natural language in web documents. The authors argue for the replacement of GIR with QGIR for cases where the place name and thematic representations are necessary, considering the use of semantic spatial relations and domain-specific ontologies. \citeN{freire_metadata_2011} described an approach for recognizing place names expressed in metadata of digital libraries. That approach should be better at capturing features of the non-structured text found in metadata records and at the exploration of the relevant information in the structured data of those records.

\citeN{lee_urban_2013} leveraged the amount of geotagged tweets available and performed analysis in order to discover behavioural patterns. They only apply geoparsing when the location metadata is in a raw text format. Such work keeps the focus on users' location and time without deeply analyzing their textual messages in order to identify spatial areas eventually mentioned. Following the same direction, \citeN{magdy_taghreed:_2014} present a system for scalable querying, analyzing and visualizing geotagged microblogs which only considers pre-geotagged tweets. Neither geoparsing nor geotagging techniques are discussed or even applied by them. \citeN{hawelka_geo-located_2014} focused on human mobility through analysing geotagged Twitter messages containing explicit geographic coordinates. They could show the power of geotagged tweets, even with a tiny percentage of the messages, discovering human behaviours based on both space and time.

\citeN{watanabe_jasmine:_2011} proposed an automatic method of identifying geographic location in non-geotagged tweets. Such method is based on the clustering of messages according to the type of event, considering short time intervals, small geographic areas and geotagged tweets. Thus, geotagged tweets are used to allocate geotags in tweets which do not have the geographic tag yet. The authors did not consider the possibility of the geotagged tweets having a different geographic reference than the location discussed in the messages. In addition, users do not necessarily talk about their current locations. Therefore, there is a possibility of errors in the geographic precision and this must be considered. In a similar way, \citeN{jung_towards_2011} presented a method of analyzing sets of microtexts, aiming to identify contextual clusters of tweets. By establishing a contextual relation between the messages, a set of microtexts can be considered as a single document and make the process easier for the geoparsers. This task, however, can be very costly, depending on the volume of related tweets. In addition, there is also a possibility of errors in the geographic precision.

As we can notice, there are many research efforts addressing the power of location-related social media. While most authors focus on the user location through pre-geotagged metadata, others investigate the identification of geographic locations in social media messages focusing exclusively on the text. However, the majority do not address specific issues of the Portuguese language. Furthermore, they do not address the domain of LBSNs and the aim of providing valuable information for such environments in an automated way. In this sense, our proposal comes as a solution to cover this gap.

\section{Conclusion and Further Work}

In this article, we presented an approach to the automated production of VGI based on geoparsing and georeferencing of texts published on the Web. Such approach was conceived with the objective of turning Web authors into volunteers in the VGI context, contributing to the indirect production of information in a Location-based Social Network.

A prototype, called text2vgi, was implemented with the goal of validating the ideas proposed by our approach. In order to evaluate the prototype in a real context, we carried out a case study using a set of microtexts in the Portuguese Language concerning a sporting event of large impact on the media, the 2013 FIFA's Confederations Cup, held in Brazil.

Overall, the achieved results were considered satisfactory. However, we have confirmed the need for improving the georeferencing strategy in order to increase the amount of VGI produced from microtexts, to improve the spatial accuracy of the spatiotemporal markers created and to achieve better results for the recall and F-Measure. It is important to consider points of interest (such as soccer stadiums and airports) and well known places in a city context. Thus, the automatically produced VGI will become more spatially precise and the user's experience in the LBSN will be improved.

As future work, we consider the implementation of georeferencing strategies to address the specific treatment of microtexts, such as the analysis of informal language. Furthermore, we will seek the development of heuristics that increase the precision of the locations detected, and consequently improve the F-Measure. Other future direction of our work is to improve our approach for producing VGI based on microtexts in other languages such as English, Spanish and French.


% INCLUDE BIBLIOGRAPHY WHICH MUST FOLLOW jidm.bst TEMPLATE
\bibliographystyle{jidm}
\bibliography{jidmb}
% For information on how to write bibliography entries,
% see file jidmb.bib


\begin{received}
\end{received}

\end{document}
