[Fencommits] fenfire-hs: irc2rdf fixes: unicode, turtle literals, dateTime vs. date
Tuukka Hastrup
Tuukka.Hastrup at iki.fi
Mon Mar 12 05:23:09 EET 2007
Mon Mar 12 05:21:42 EET 2007 Tuukka Hastrup <Tuukka.Hastrup at iki.fi>
* irc2rdf fixes: unicode, turtle literals, dateTime vs. date
diff -rN -u old-fenfire-hs/Irc2RDF.hs new-fenfire-hs/Irc2RDF.hs
--- old-fenfire-hs/Irc2RDF.hs 2007-03-12 05:23:09.000000000 +0200
+++ new-fenfire-hs/Irc2RDF.hs 2007-03-12 05:23:09.000000000 +0200
@@ -1,4 +1,4 @@
-
+{-# OPTIONS_GHC -fffi #-}
module Irc2RDF where
-- Copyright (c) 2006-2007, Benja Fallenstein, Tuukka Hastrup
@@ -25,12 +25,53 @@
import System.IO.Unsafe (unsafeInterleaveIO)
import Data.Char (toUpper, toLower)
+import Data.Char (ord)
+import Data.Bits ((.&.))
+
+import qualified Control.Exception
+
+import System.Glib.UTFString (newUTFString, readCString,
+ peekUTFString)
+import System.Glib.FFI (withCString, nullPtr, CString, CInt, Ptr)
+import System.IO.Unsafe (unsafePerformIO)
+
+foreign import ccall "g_utf8_validate" valid :: CString -> CInt ->
+ Ptr (CString) -> Bool
+
+-- XXX real toUTF isn't exported from System.Glib.UTFString
+toUTF :: String -> String
+toUTF s = unsafePerformIO $ newUTFString s >>= readCString
+
+fromUTF :: String -> String
+fromUTF s = unsafePerformIO $ Control.Exception.catch
+ (withCString s $ \cstr -> peekUTFString cstr >>= \s' ->
+ if (valid cstr (-1) nullPtr) then return s' -- force any exceptions
+ else return s )
+ (\_e -> return s) -- if any, keep the local encoding
+
+-- from gutf8.c used in g_utf8_validate
+isUnicode c' = let c = ord c' in
+ c < 0x110000 &&
+ c .&. 0xFFFFF800 /= 0xD800 &&
+ (c < 0xFDD0 || c > 0xFDEF) &&
+ c .&. 0xFFFE /= 0xFFFE
+
+-- XXX which unicode characters must be escaped?
+turtle_escaped :: Char -> String -> String
+turtle_escaped _ [] = []
+turtle_escaped c ('\\':xs) = '\\':'\\':turtle_escaped c xs
+turtle_escaped c (x:xs) | c == x
+ = '\\': c:turtle_escaped c xs
+turtle_escaped c ('\n':xs) = '\\': 'n':turtle_escaped c xs
+turtle_escaped c ('\r':xs) = '\\': 'r':turtle_escaped c xs
+turtle_escaped c ('\t':xs) = '\\': 't':turtle_escaped c xs
+turtle_escaped c ( x:xs) = x:turtle_escaped c xs
main = do [root,filepath] <- getArgs
'h':'t':'t':'p':':':'/':'/':_ <- return root
irc <- getContents
timestamps <- getTimeStamps
- mapM_ (uncurry $ handle root filepath) $ zip (lines irc)
+ mapM_ (uncurry $ handle root filepath) $ zip (map fromUTF$lines irc)
(uniquify timestamps)
getTimeStamps = do ~(TOD secs _picos) <- unsafeInterleaveIO getClockTime
@@ -49,7 +90,7 @@
handle :: String -> FilePath -> String -> (ClockTime, Maybe Integer) -> IO ()
handle root filepath line (clockTime,offset) = do
let (file,output) = irc2rdf root filepath (clockTime,offset) line
- maybe (return ()) ((flip appendFile) output) file
+ maybe (return ()) ((flip appendFile) (toUTF output)) file
irc2rdf :: String -> FilePath -> (ClockTime, Maybe Integer) -> String ->
(Maybe FilePath,String)
@@ -69,7 +110,8 @@
Maybe String -> [String] -> (Maybe FilePath, String)
triples root filepath (time,offset) (Just prefix) [cmd,target,msg]
| map toUpper cmd == "PRIVMSG",
- '#':channel <- map toLower target, channel `elem` ["fenfire","swig"]
+ '#':channel <- map toLower target, channel `elem` ["fenfire","sioc",
+ "swig","haskell"]
=
let file = channel ++ "-" ++ day
uri = root ++ file ++ "#" ++ second ++ maybe "" (('.':) . show) offset
@@ -80,21 +122,22 @@
"<irc://freenode/%23"++channel++"> <"++isContainerOf++"> <"++uri++">.\n"++
"<irc://freenode/%23"++channel++"> <"++rdftype++"> <"++forum++">.\n"++
"<"++uri++"> <"++created++"> "++
- show (day++"T"++second++"Z")++"^^<"++date++">.\n"++
+ t (day++"T"++second++"Z")++"^^<"++date++">.\n"++
"<"++uri++"> <"++hasCreator++"> <"++creator++">.\n"++
- "<"++uri++"> <"++hasContent++"> "++show msg++".\n"++
- "<"++uri++"> <"++label++"> "++show ("<"++nick++"> "++msg)++".\n"++
+ "<"++uri++"> <"++hasContent++"> "++t msg++".\n"++
+ "<"++uri++"> <"++label++"> "++t ("<"++nick++"> "++msg)++".\n"++
"<"++uri++"> <"++rdftype++"> <"++post++">.\n"++
- "<"++creator++"> <"++label++"> "++show nick++".\n"++
+ "<"++creator++"> <"++label++"> "++t nick++".\n"++
"<"++creator++"> <"++rdftype++"> <"++user++">.\n"
)
- where label = "http://www.w3.org/2000/01/rdf-schema#label"
+ where t str = "\"" ++ turtle_escaped '\"' str ++ "\""
+ label = "http://www.w3.org/2000/01/rdf-schema#label"
rdftype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
created = "http://purl.org/dc/terms/created"
isContainerOf = "http://rdfs.org/sioc/ns#is_container_of"
hasCreator = "http://rdfs.org/sioc/ns#has_creator"
hasContent = "http://rdfs.org/sioc/ns#has_content"
- date = "http://www.w3.org/2001/XMLSchema#date"
+ date = "http://www.w3.org/2001/XMLSchema#dateTime"
forum = "http://rdfs.org/sioc/ns#Forum"
post = "http://rdfs.org/sioc/ns#Post"
user = "http://rdfs.org/sioc/ns#User"
More information about the Fencommits
mailing list