Word clouds

Remco Bloemen

2015-08-26

This word cloud is generated from the SU GIC NL competition entries.
This word cloud is generated from the SU GIC NL competition entries.

Finding and weighing keywords

text = Import["~/text.txt"] // ToLowerCase;
text = StringReplace[text, {"educational" → "education","teachers" → "teacher","learner" → "student","technologies" → "technology","learning" → "learn","leerling" → "student","studenten" → "student","students" → "student","questions" → "question","levels" → "level"}];
tally = Tally@Cases[StringSplit[text,Except@LetterCharacter],_?(StringLength@#>4&)];
tally = Cases[tally,_?(Last@#>15&)];
tally = Reverse@SortBy[tally,Last];
range = {Min@(Last/@tally),Max@(Last/@tally)};
common = {"the","of","and","to","in","I","that","was","his","he","it","with","is","for","as","had","you","not","be","her","on","at","by","which","have","or","from","this","him","but","all","she","they","were","my","are","me","one","their","so","an","said","them","we","who","would","been","will","no","when","there","if","more","out","up","into","do","any","your","what","has","man","could","other","than","our","some","very","time","upon","about","may","its","only","now","like","little","then","can","should","made","did","us","such","a","great","before","must","two","these","see","know","over","much","down","after","first","mr","good","men","worden","around","based","where","current","because","become","means","already","possible","through","every","using","kunnen","within","wordt","first","second","binnen","while"};
tally = DeleteCases[tally, w_/; MemberQ[common,w[[1]]]];

Adding color and rotation

words = Style[First@#, FontFamily -> "Times",
     FontColor ->
      Hue[RandomReal[], RandomReal[{.5, 1}], RandomReal[{.5, 1}]],
     FontSize -> (Last@Rescale[#, range, {12, 150}])] & /@ tally;

wordsimg =
  ImagePad[#, -3 -
      BorderDimensions[#]] & /@ (Image[
       Graphics[Text[Framed[#, FrameMargins -> 2]]]] & /@ words);
angles = Join[{0, 0, 0}, RandomReal[{-Pi/4, Pi/4}, 5],
   RandomReal[{-Pi/2, Pi/2}, Length[wordsimg] - 3 - 5]];
wordsimgRot =
  ImageRotate[##, Background -> White] & @@@
   Transpose[{wordsimg, angles}];

Tightly packing objects

iteration2[img1_, w_, fun_: (Norm[#1 - #2] &)] :=
 Module[{imdil, centre, diff, dimw, padding, padded1, minpos},
  dimw = ImageDimensions[w];
  padded1 = ImagePad[img1, {dimw[[1]] {1, 1}, dimw[[2]] {1, 1}}, 1];
  imdil =
   Binarize[
    ImageCorrelate[Binarize[ColorNegate[padded1], 0.05],
     Dilation[Binarize[ColorNegate[w], 0.05], 5]]];
  centre = ImageDimensions[padded1]/2;
  minpos =
   Reverse@Nearest[Position[Reverse[ImageData[imdil]], 0],
      Reverse[centre], DistanceFunction -> fun][[1]];
  Sow[minpos - centre];(*for creating vector plot*)
  diff = ImageDimensions[imdil] - dimw;
  padding[pos_] := Transpose[{#, diff - #} &@Round[pos - dimw/2]];
  ImagePad[#, (-Min[#] {1, 1}) & /@ BorderDimensions[#]] &@
   ImageMultiply[padded1, ImagePad[w, padding[minpos], 1]]]

poslist =
 Reap[img = Fold[iteration2, wordsimgRot[[1]], Rest[wordsimgRot]];][[
  2, 1]]

The crux of the algorithm is a call to ImageCorrelate.

Converting back to vector

We now have coordinates for the bitmap images. We re-use these coordinates to create new Text entries. This is to convert it back to a vector image.

Graphics[MapThread[
  Text[#1, Offset[#2, {0, 0}], {0, 0}, {Cos[#3], Sin[#3]}] &, {words,
   Prepend[poslist, {0, 0}], angles}]]

See also

https://mathematica.stackexchange.com/questions/2334/how-to-create-word-clouds