ºÝºÝߣ

ºÝºÝߣShare a Scribd company logo
tost@indf.net
. ????
20140522 uc?? ???_????_??
20140522 uc?? ???_????_??
“???? ?? ????? ??”
??? ??? ??(CARDINAL NUMBER)
1233 3?
* UV vs PV
UV (Unique Visitor) - ??? ???? ??? ???
- ?????? 1? ????, 10? ??
??, ???? 1? ??
PV (Page View) - ???? ?? ??
- ???? ???? ?? ?? ??? ?
PV??? ????? 1?? ????? ???? ?? ? ? ??
(?????? ???? PV???)
20140522 uc?? ???_????_??
?? ??
Count/
SUM
Max Min UC
2014-01 1, 2, 3, 4, 4 5 4 1 4
2014-02 1, 4, 1 3 4 1 2
2014-03 2, 6 2 6 2 2
2014-05 7, 5 2 7 5 2
2014-06 5, 1, 3 3 5 1 2
2014-01 ~ 2014-06 ? ???
?? ??
Count/
SUM
Max Min UC
2014-01
~
2014-06
1, 2, 3, 4, 4
1, 4, 1
2, 6
7, 5
5, 1, 3
15 6 1 7
2014-01 ~ 2014-06 ? ???.
?? ??
Count/
SUM
Max Min UC
2014-01 1, 2, 3, 4, 4 5 4 1 4
2014-02 1, 4, 1 3 4 1 2
2014-03 2, 6 2 6 2 2
2014-05 7, 5 2 7 5 2
2014-06 5, 1, 3 3 5 1 2
15 7 1
???, UC? ??? ?? ???? ?(value)? ?? ? ?? ?? ? ??
????
UC???
??? ??(Value)??
????? ?? ? ? ??
?,
?? ??? ?? ?? ?
?? ?? ??? ?? ???, ??????? ??? ???
?? ???? ?? ??? ??.
1
2
3
4
6
1
2
3
4
6
∪
5? 2? 3?
?, ????? ??? ? ???? ???? ??.
???? ???? ??? ?.
12 3 46 1
2
3
4
6
∪
UC=5? 3? 3?
2
2
+=
20140522 uc?? ???_????_??
select
count(distinct ???) as uc
from
t_table
// Counting distinct values with a Set: an element added twice is stored once.
Set<String> set = new HashSet<String>();
set.add("a");
set.add("a"); // duplicate of the previous element, so the set does not grow
set.add("xxx");
set.add("yyy");
System.out.println( set.size() ); // print : 3
???
???? (set)
20140522 uc?? ???_????_??
?? M/R?? UC? ???? ??.
??? KEY??? ??? iterator<Value> ??? ???.
?, Value?? ??? ???? ??? ???? ?? ???? ???
??? KEY??? ??? ??????
Value??? ??????? ??!!!!
??? KEY ??? ?? ??? ?? ?? ????
???.
??? ?? ??? ??? ??? ??? ??? ??.
?, ‘wc -l’ ?? ???? ???? ??? ? ??.
??? ? A??? B??? UC ? ????
? M/R? ??? ??? ?????? ??? ? ?? ?? ???.
????? ??????
????? ??
A
B
E
C
D
F
G
20140522 uc?? ???_????_??
20140522 uc?? ???_????_??
- ??? INPUT? ???? Map???? ?? (??? ??)
- ??? ??? ??? ??? ???? ????? ??
( ??? ?? ?? ??? ?? ?? ?? ??? ?? ??)
- ????? ???? ?????? ??
- ??? ??? ?? ???? ?? ????? ?? (=bitset??)
MOD
(????)
[4,1,123,6,2,324,234, 6, 4,1,123]
[123,6,234]
[4,1,324]
[2]
????
(????)
merge ??
???? ??? ??? ???? ???.
Set? ???? ????? ?? UC? ??? ?? ??/????? UC
? ??? ??? ??? ???.
Set p = new Set();
Set c1 = new Set();
Set c2 = new Set();
p.addAll(c1);
p.addAll(c2);
p.size();
20140522 uc?? ???_????_??
HashSet? ?? integer??? ??? ?? ,
??? 4???? ???? 2.5?? ?? ??? ???,
??? ?????? 563?? ?? ?? ? ??
?? ???? ?? ??? ????,
??? ???? ?? ????? ???,
?? ??? ???? ??? ???? ?? ?? ???.
Bitmask ? ??? ???
??? 4byte? 32?? ??? ???
1024mb?? 5.3?? ??? ??? ?????.
???, ???? ????? ?.
??? ???? ??? ??? ?? max?? ??? ?? ??? ??.
‘394762’???, hashset??? 4byte, bitset??? 49345byte
Bitset? ??? ???? ??
1?? ??? HashSet, ? ??? ???? BitSet?? ????
Set? ?? ?.
???,
???? ??? 1??? ???
??? ????? ?? ??
?? ??? ?? ???? ?? ??? ???
??? bitset? ??? ??? ????? bitset? ???.
public class SmartWappedBitSet implements Set<Integer> {
private static final int DEFAULT_BLOCK_SIZE = 10000;//bit??
private int INIT_BLOCK_SIZE;
private final Map<Integer,BitSet> folder;
/**
 * Creates an empty set that splits each value into a block key and a bit
 * offset using {@code blockSize} as the divisor (see {@code add}).
 *
 * @param blockSize divisor applied to every added value; must be positive
 * @throws IllegalArgumentException if {@code blockSize} is zero or negative
 */
public SmartWappedBitSet(final int blockSize) {
    // A zero block size would make every add() fail with a division by zero,
    // so reject it here where the mistake is made.
    if (blockSize <= 0) {
        throw new IllegalArgumentException("blockSize must be positive: " + blockSize);
    }
    this.INIT_BLOCK_SIZE = blockSize;
    folder = new HashMap<Integer,BitSet>();
}
/**
 * Creates an empty set using {@code DEFAULT_BLOCK_SIZE} as the block size.
 */
public SmartWappedBitSet() {
this(DEFAULT_BLOCK_SIZE);
}
/**
 * Adds {@code e} to this set.
 *
 * The value is split into a block key ({@code e / INIT_BLOCK_SIZE}) and a
 * bit offset inside that block ({@code e % INIT_BLOCK_SIZE}); the block's
 * {@link BitSet} is created the first time the key is seen.
 *
 * @param e the value to add; unboxing a {@code null} throws
 *          {@code NullPointerException}
 * @return {@code true} if the value was not already present, {@code false}
 *         otherwise, as required by {@code Set.add}
 * @throws IndexOutOfBoundsException if the computed bit offset is negative
 *         (a negative {@code e} that is not a multiple of the block size)
 */
@Override
public boolean add(Integer e) {
    final int blockKey = e / INIT_BLOCK_SIZE;
    final int bitOffset = e % INIT_BLOCK_SIZE;

    // Single lookup instead of containsKey + put + get.
    BitSet block = folder.get(blockKey);
    if (block == null) {
        block = new BitSet();
        folder.put(blockKey, block);
    }

    // Report whether the set actually changed instead of always true.
    if (block.get(bitOffset)) {
        return false;
    }
    block.set(bitOffset);
    return true;
}
.
bitset
smart
bitset
???? ???
??? ??
.
block bitset
Hybridset
(HashSet + Bitset)
Hybridset2
(HashSet + smartbitset)
??? ? ????
???
??? GC? ? ?? ???.
20140522 uc?? ???_????_??
- ??? ???, HashSet? ??? ?? ??.
- Smartbitset??? ?? ?? bitset? ???, ????? ? ???
- ??? ??? ? ?? ??? ?? ??? ??????
- Hashset?? bitset??? loop? ??? ?? ??? ?? ??
20140522 uc?? ???_????_??
20140522 uc?? ???_????_??
- ???? 1/3 ?? ??? ??
- ???? 3? ??
- ????? loop, GC, ?? instance? ???
. ????

More Related Content

20140522 uc?? ???_????_??

  • 4. ¡°???? ?? ????? ??¡± ??? ??? ??(CARDINAL NUMBER)
  • 6. * UV vs PV UV (Unique Visitor) - ??? ???? ??? ??? - ?????? 1? ????, 10? ?? ??, ???? 1? ?? PV (Page View) - ???? ?? ?? - ???? ???? ?? ?? ??? ? PV??? ????? 1?? ????? ???? ?? ? ? ?? (?????? ???? PV???)
  • 8. ?? ?? Count/ SUM Max Min UC 2014-01 1, 2, 3, 4, 4 5 4 1 4 2014-02 1, 4, 1 3 4 1 2 2014-03 2, 6 2 6 2 2 2014-05 7, 5 2 7 5 2 2014-06 5, 1, 3 3 5 1 2 2014-01 ~ 2014-06 ? ???
  • 9. ?? ?? Count/ SUM Max Min UC 2014-01 ~ 2014-06 1, 2, 3, 4, 4 1, 4, 1 2, 6 7, 5 5, 1, 3 15 6 1 7 2014-01 ~ 2014-06 ? ???.
  • 10. ?? ?? Count/ SUM Max Min UC 2014-01 1, 2, 3, 4, 4 5 4 1 4 2014-02 1, 4, 1 3 4 1 2 2014-03 2, 6 2 6 2 2 2014-05 7, 5 2 7 5 2 2014-06 5, 1, 3 3 5 1 2 15 7 1 ???, UC? ??? ?? ???? ?(value)? ?? ? ?? ?? ? ?? ????
  • 11. UC??? ??? ??(Value)?? ????? ?? ? ? ?? ?, ?? ??? ?? ?? ?
  • 12. ?? ?? ??? ?? ???, ??????? ??? ??? ?? ???? ?? ??? ??. 1 2 3 4 6 1 2 3 4 6 ¡È 5? 2? 3?
  • 13. ?, ????? ??? ? ???? ???? ??. ???? ???? ??? ?. 12 3 46 1 2 3 4 6 ¡È UC=5? 3? 3? 2 2 +=
  • 15. select count(distinct ???) as uc from t_table Set<String> set = new HashSet<String>(); set.add(¡°a¡±); set.add(¡°a¡±); //?? set.add(¡°xxx¡±); set.add(¡°yyy¡±); System.out.println( set.size() ); // print : 3 ??? ???? (set)
  • 17. ?? M/R?? UC? ???? ??. ??? KEY??? ??? iterator<Value> ??? ???. ?, Value?? ??? ???? ??? ???? ?? ???? ??? ??? KEY??? ??? ?????? Value??? ??????? ??!!!!
  • 18. ??? KEY ??? ?? ??? ?? ?? ???? ???. ??? ?? ??? ??? ??? ??? ??? ??. ?, ¡®wc ¨Cl¡¯ ?? ???? ???? ??? ? ??. ??? ? A??? B??? UC ? ???? ? M/R? ??? ??? ?????? ??? ? ?? ?? ???.
  • 22. - ??? INPUT? ???? Map???? ?? (??? ??) - ??? ??? ??? ??? ???? ????? ?? ( ??? ?? ?? ??? ?? ?? ?? ??? ?? ??) - ????? ???? ?????? ?? - ??? ??? ?? ???? ?? ????? ?? (=bitset??)
  • 24. ???? ??? ??? ???? ???. Set? ???? ????? ?? UC? ??? ?? ??/????? UC ? ??? ??? ??? ???. Set p = new Set(); Set c1 = new Set(); Set c2 = new Set(); p.addAll(c1); p.addAll(c2); p.size();
  • 26. HashSet? ?? integer??? ??? ?? , ??? 4???? ???? 2.5?? ?? ??? ???, ??? ?????? 563?? ?? ?? ? ?? ?? ???? ?? ??? ????, ??? ???? ?? ????? ???, ?? ??? ???? ??? ???? ?? ?? ???.
  • 27. Bitmask ? ??? ??? ??? 4byte? 32?? ??? ??? 1024mb?? 5.3?? ??? ??? ?????. ???, ???? ????? ?. ??? ???? ??? ??? ?? max?? ??? ?? ??? ??. ¡®394762¡¯???, hashset??? 4byte, bitset??? 49345byte
  • 28. Bitset? ??? ???? ?? 1?? ??? HashSet, ? ??? ???? BitSet?? ???? Set? ?? ?.
  • 29. ???, ???? ??? 1??? ??? ??? ????? ?? ??
  • 30. ?? ??? ?? ???? ?? ??? ??? ??? bitset? ??? ??? ????? bitset? ???.
  • 31. public class SmartWappedBitSet implements Set<Integer> { private static final int DEFAULT_BLOCK_SIZE = 10000;//bit?? private int INIT_BLOCK_SIZE; private final Map<Integer,BitSet> folder; public SmartWappedBitSet(final int blockSize) { this.INIT_BLOCK_SIZE = blockSize; folder = new HashMap<Integer,BitSet>(); } public SmartWappedBitSet() { this(DEFAULT_BLOCK_SIZE); } @Override public boolean add(Integer e) { final int index = e / INIT_BLOCK_SIZE; final int value = e % INIT_BLOCK_SIZE; if (!folder.containsKey(index)) { folder.put(index, new BitSet()); } folder.get(index).set(value); return true; }
  • 33. .
  • 34. block bitset Hybridset (HashSet + Bitset) Hybridset2 (HashSet + smartbitset) ??? ? ???? ???
  • 35. ??? GC? ? ?? ???.
  • 37. - ??? ???, HashSet? ??? ?? ??. - Smartbitset??? ?? ?? bitset? ???, ????? ? ??? - ??? ??? ? ?? ??? ?? ??? ?????? - Hashset?? bitset??? loop? ??? ?? ??? ?? ??
  • 40. - ???? 1/3 ?? ??? ?? - ???? 3? ?? - ????? loop, GC, ?? instance? ???

Editor's Notes

  • #5: ????,???(»ù”µ,???:?cardinal number)???? ???????????? ???? ?? ????. ???? ??? ??? ?? ??? ??? ?? ???, ???? ????????? ??
  • #6: 3? ??????? 3??.
  • #16: ????? sorting? ?????? ??, Set? ???? ?? HashSet, TreeSet ??? ???. HashSet? O(1)?.
  • #17: ?????? ?? ???? ?? ??? ??, ??? Set? ???? ?? ??? ??? ??? ??? ???.
  • #18: ?? M/R?? UC? ???? ??. ??? KEY??? ??? iterator<Value> ??? ???. ?, KEY??? ???, VALUE??? ????? ?? Set? ???? ???? ???? ???? ???? ??? element? ????????. reduce?? value? uc? ?????? ???? ????? ???? ?????? ???. ??? Key? ??? grouping ?? ???, ??? KEY? ?? ????? ??? ???. ??? ????? UC? ??????? map???? ???? ???? ?????.
  • #22: ?????? ?? ???? ?? ??? ??, ??? Set? ???? ?? ??? ??? ??? ??? ???.
  • #23: ???, ???? ???? ???? ?? ????? ?????? ??? ?????? ???. ??? map/reduce??? ???? ?? ????? ??? map??? ???? ???. ?? UC? ????? ?????? ??? ???.
  • #27: ?????, Bitmap? ?? bitset? ??? ?? ?? ?? ??? ??. 5.3????. ??? bitset? ??? ??? value? ?? ??????? ???. ??? ??? ??? hashset, ??? bitset? ??? ?.
  • #28: 536870910 ?? ? 1?? ??... 1G? ?????? ???.
  • #29: 536870910 ?? ? 1?? ??... 1G? ?????? ???.
  • #30: 536870910 ?? ? 1?? ??... 1G? ?????? ???.
  • #31: 536870910 ?? ? 1?? ??... 1G? ?????? ???.
  • #33: bitset???? ??? ??????? ???? 1~5? ??? ??? ???? ?? ? 3.7????? uc? ??? ???. ???? ??? ?? ??, ????? ??? ???
  • #35: ?? hybridset?? bitset? smartbitset?? ??. ? ???? ? CPU? ?? ??. (????? 23G?? ????? 38G?? cpu? ???)
  • #39: hashset? bitset(or smartbitset)????? ????? ??. ??? hybridset?? ?? ??.